lychee_lib/checker/file.rs
1use http::StatusCode;
2use log::warn;
3use std::borrow::Cow;
4use std::path::{Path, PathBuf};
5
6use crate::checker::wikilink::resolver::WikilinkResolver;
7use crate::{
8 Base, ErrorKind, Result, Status, Uri,
9 utils::fragment_checker::{FragmentChecker, FragmentInput},
10};
11
12/// A utility for checking the existence and validity of file-based URIs.
13///
14/// `FileChecker` resolves and validates file paths, handling both absolute and relative paths.
15/// It supports base path resolution, fallback extensions for files without extensions,
16/// and optional fragment checking for HTML files.
17#[derive(Debug, Clone)]
18pub(crate) struct FileChecker {
19 /// Base path or URL used for resolving relative paths.
20 base: Option<Base>,
21 /// List of file extensions to try if the original path doesn't exist.
22 fallback_extensions: Vec<String>,
23 /// If specified, resolves to one of the given index files if the original path
24 /// is a directory.
25 ///
26 /// If non-`None`, a directory must contain at least one of the file names
27 /// in order to be considered a valid link target. Index files names are
28 /// required to match regular files, aside from the special `.` name which
29 /// will match the directory itself.
30 ///
31 /// If `None`, index file checking is disabled and directory links are valid
32 /// as long as the directory exists on disk.
33 index_files: Option<Vec<String>>,
34 /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files.
35 include_fragments: bool,
36 /// Utility for performing fragment checks in HTML files.
37 fragment_checker: FragmentChecker,
38 /// Utility for optionally resolving Wikilinks.
39 wikilink_resolver: Option<WikilinkResolver>,
40}
41
42impl FileChecker {
43 /// Creates a new `FileChecker` with the given configuration.
44 ///
45 /// # Arguments
46 ///
47 /// * `base` - Optional base path or URL for resolving relative paths.
48 /// * `fallback_extensions` - List of extensions to try if the original file is not found.
49 /// * `index_files` - Optional list of index file names to search for if the path is a directory.
50 /// * `include_fragments` - Whether to check for fragment existence in HTML files.
51 /// * `include_wikilinks` - Whether to check the existence of Wikilinks found in Markdown files .
52 ///
53 /// # Errors
54 ///
55 /// Fails if an invalid `base` is provided when including wikilinks.
56 pub(crate) fn new(
57 base: Option<Base>,
58 fallback_extensions: Vec<String>,
59 index_files: Option<Vec<String>>,
60 include_fragments: bool,
61 include_wikilinks: bool,
62 ) -> Result<Self> {
63 let wikilink_resolver = if include_wikilinks {
64 Some(WikilinkResolver::new(
65 base.as_ref(),
66 fallback_extensions.clone(),
67 )?)
68 } else {
69 None
70 };
71
72 Ok(Self {
73 base,
74 fallback_extensions,
75 index_files,
76 include_fragments,
77 fragment_checker: FragmentChecker::new(),
78 wikilink_resolver,
79 })
80 }
81
82 /// Checks the given file URI for existence and validity.
83 ///
84 /// This method resolves the URI to a file path, checks if the file exists,
85 /// and optionally checks for the existence of fragments in HTML files.
86 ///
87 /// # Arguments
88 ///
89 /// * `uri` - The URI to check.
90 ///
91 /// # Returns
92 ///
93 /// Returns a `Status` indicating the result of the check.
94 pub(crate) async fn check(&self, uri: &Uri) -> Status {
95 let Ok(path) = uri.url.to_file_path() else {
96 return ErrorKind::InvalidFilePath(uri.clone()).into();
97 };
98
99 let path = self.resolve_base(&path);
100 let path = self.resolve_local_path(&path, uri);
101 match path {
102 Ok(path) => self.check_file(path.as_ref(), uri).await,
103 Err(err) => err.into(),
104 }
105 }
106
107 /// Resolves the given path using the base path, if one is set.
108 ///
109 /// # Arguments
110 ///
111 /// * `path` - The path to resolve.
112 ///
113 /// # Returns
114 ///
115 /// Returns the resolved path as a `PathBuf`, or the original path
116 /// if no base path is defined.
117 fn resolve_base(&self, path: &Path) -> PathBuf {
118 if let Some(Base::Local(base_path)) = &self.base {
119 if path.is_absolute() {
120 let absolute_base_path = if base_path.is_relative() {
121 std::env::current_dir().unwrap_or_default().join(base_path)
122 } else {
123 base_path.clone()
124 };
125
126 let stripped = path.strip_prefix("/").unwrap_or(path);
127 absolute_base_path.join(stripped)
128 } else {
129 base_path.join(path)
130 }
131 } else {
132 path.to_path_buf()
133 }
134 }
135
136 /// Resolves the given local path by applying logic which is specific to local file
137 /// checking - currently, this includes fallback extensions and index files.
138 ///
139 /// # Arguments
140 ///
141 /// * `path` - The path to check. Need not exist.
142 /// * `uri` - The original URI, used for error reporting.
143 ///
144 /// # Returns
145 ///
146 /// Returns `Ok` with the resolved path if it is valid, otherwise returns
147 /// `Err` with an appropriate error. The returned path, if any, is guaranteed
148 /// to exist and may be a file or a directory.
149 fn resolve_local_path<'a>(&self, path: &'a Path, uri: &Uri) -> Result<Cow<'a, Path>> {
150 let path = match path.metadata() {
151 // for non-existing paths, attempt fallback extensions
152 // if fallback extensions don't help, try wikilinks
153 Err(e) if e.kind() == std::io::ErrorKind::NotFound => self
154 .apply_fallback_extensions(path, uri)
155 .or_else(|_| {
156 if let Some(resolver) = &self.wikilink_resolver {
157 resolver.resolve(path, uri)
158 } else {
159 Err(ErrorKind::InvalidFilePath(uri.clone()))
160 }
161 })
162 .map(Cow::Owned),
163
164 // other IO errors are unexpected and should fail the check
165 Err(e) => Err(ErrorKind::ReadFileInput(e, path.to_path_buf())),
166
167 // existing directories are resolved via index files
168 Ok(meta) if meta.is_dir() => self.apply_index_files(path).map(Cow::Owned),
169
170 // otherwise, path is an existing file - just return the path
171 Ok(_) => Ok(Cow::Borrowed(path)),
172 };
173
174 // if initial resolution results in a directory, also attempts to apply
175 // fallback extensions. probably, this always makes sense because
176 // directories are treated as having no fragments, so a real file with
177 // a fallback extension (if it exists) will potentially contain more
178 // fragments and thus be "more useful".
179 //
180 // (currently, this case is only reachable if `.` is in the index_files list.)
181 match path {
182 Ok(dir_path) if dir_path.is_dir() => self
183 .apply_fallback_extensions(&dir_path, uri)
184 .map(Cow::Owned)
185 .or(Ok(dir_path)),
186 Ok(path) => Ok(path),
187 Err(err) => Err(err),
188 }
189 }
190
191 /// Resolves a path to a file, applying fallback extensions if necessary.
192 ///
193 /// This function will try to find a file, first by attempting the given path
194 /// itself, then by attempting the path with each extension from
195 /// [`FileChecker::fallback_extensions`]. The first existing file (not directory),
196 /// if any, will be returned.
197 ///
198 /// # Arguments
199 ///
200 /// * `path` - The path to resolve.
201 /// * `uri` - The original URI, used for error reporting.
202 ///
203 /// # Returns
204 ///
205 /// Returns `Ok(PathBuf)` with the resolved file path, or `Err` if no valid file is found.
206 /// If `Ok` is returned, the contained `PathBuf` is guaranteed to exist and be a file.
207 fn apply_fallback_extensions(&self, path: &Path, uri: &Uri) -> Result<PathBuf> {
208 // If it's already a file, use it directly
209 if path.is_file() {
210 return Ok(path.to_path_buf());
211 }
212
213 // Try fallback extensions
214 let mut path_buf = path.to_path_buf();
215 for ext in &self.fallback_extensions {
216 path_buf.set_extension(ext);
217 if path_buf.is_file() {
218 return Ok(path_buf);
219 }
220 }
221
222 Err(ErrorKind::InvalidFilePath(uri.clone()))
223 }
224
225 /// Tries to find an index file in the given directory, returning the first match.
226 /// The index file behavior is specified by [`FileChecker::index_files`].
227 ///
228 /// If this is non-`None`, index files must exist and resolved index files are
229 /// required to be files, aside from the special name `.` - this will match the
230 /// directory itself.
231 ///
232 /// If `None`, index file resolution is disabled and this function simply
233 /// returns the given path.
234 ///
235 /// # Arguments
236 ///
237 /// * `dir_path` - The directory within which to search for index files.
238 /// This is assumed to be an existing directory.
239 ///
240 /// # Returns
241 ///
242 /// Returns `Ok(PathBuf)` pointing to the first existing index file, or
243 /// `Err` if no index file is found. If `Ok` is returned, the contained `PathBuf`
244 /// is guaranteed to exist. In most cases, the returned path will be a file path.
245 ///
246 /// If index files are disabled, simply returns `Ok(dir_path)`.
247 fn apply_index_files(&self, dir_path: &Path) -> Result<PathBuf> {
248 // this implements the "disabled" case by treating a directory as its
249 // own index file.
250 let index_names_to_try = match &self.index_files {
251 Some(names) => &names[..],
252 None => &[".".to_owned()],
253 };
254
255 let invalid_index_error = || {
256 // Drop empty index file names. These will never be accepted as valid
257 // index files, and doing this makes cleaner error reporting.
258 let mut names = index_names_to_try.to_vec();
259 names.retain(|x| !x.is_empty());
260
261 ErrorKind::InvalidIndexFile(names)
262 };
263
264 index_names_to_try
265 .iter()
266 .find_map(|filename| {
267 // for some special index file names, we accept directories as well
268 // as files.
269 let exists = match filename.as_str() {
270 "." => Path::exists,
271 _ => Path::is_file,
272 };
273
274 let path = dir_path.join(filename);
275 exists(&path).then_some(path)
276 })
277 .ok_or_else(invalid_index_error)
278 }
279
280 /// Checks a resolved file, optionally verifying fragments for HTML files.
281 ///
282 /// # Arguments
283 ///
284 /// * `path` - The resolved path to check.
285 /// * `uri` - The original URI, used for error reporting.
286 ///
287 /// # Returns
288 ///
289 /// Returns a `Status` indicating the result of the check.
290 async fn check_file(&self, path: &Path, uri: &Uri) -> Status {
291 if self.include_fragments {
292 self.check_fragment(path, uri).await
293 } else {
294 Status::Ok(StatusCode::OK)
295 }
296 }
297
298 /// Checks for the existence of a fragment in a path.
299 ///
300 /// The given path may be a file or a directory. A directory
301 /// is treated as if it was an empty file with no fragments.
302 ///
303 /// # Arguments
304 ///
305 /// * `path` - The path to the file or directory. Assumed to exist.
306 /// * `uri` - The original URI, containing the fragment to check.
307 ///
308 /// # Returns
309 ///
310 /// Returns a `Status` indicating the result of the fragment check.
311 async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
312 // for absent or trivial fragments, always return success.
313 if uri.url.fragment().is_none_or(str::is_empty) {
314 return Status::Ok(StatusCode::OK);
315 }
316
317 // directories are treated as if they were a file with no fragments.
318 // reaching here means we have a non-trivial fragment on a directory,
319 // so return error.
320 if path.is_dir() {
321 return ErrorKind::InvalidFragment(uri.clone()).into();
322 }
323
324 match FragmentInput::from_path(path).await {
325 Ok(input) => match self.fragment_checker.check(input, &uri.url).await {
326 Ok(true) => Status::Ok(StatusCode::OK),
327 Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
328 Err(err) => {
329 warn!("Skipping fragment check for {uri} due to the following error: {err}");
330 Status::Ok(StatusCode::OK)
331 }
332 },
333 Err(err) => {
334 warn!("Skipping fragment check for {uri} due to the following error: {err}");
335 Status::Ok(StatusCode::OK)
336 }
337 }
338 }
339}
340
341#[cfg(test)]
342mod tests {
343 use super::FileChecker;
344 use crate::{
345 ErrorKind::{InvalidFilePath, InvalidFragment, InvalidIndexFile},
346 Status, Uri,
347 };
348 use test_utils::{fixture_uri, fixtures_path};
349
350 /// Calls [`FileChecker::check`] on the given [`FileChecker`] with given URL
351 /// path (relative to the fixtures directory).
352 ///
353 /// The result of checking the link is matched against the given pattern.
354 macro_rules! assert_filecheck {
355 ($checker:expr, $path:expr, $pattern:pat) => {
356 let uri = Uri::from(fixture_uri!($path));
357 let result = $checker.check(&uri).await;
358 assert!(
359 matches!(result, $pattern),
360 "assertion failed: {} should be {} but was '{:?}'",
361 &uri,
362 stringify!($pattern),
363 &result
364 );
365 };
366 }
367
368 /// Calls [`FileChecker::resolve_local_path`] on the given [`FileChecker`]
369 /// with given URL path (relative to the fixtures directory).
370 ///
371 /// The result of resolving the link is matched against the given pattern.
372 /// The pattern should match values of type `Result<&str, ErrorKind>`.
373 macro_rules! assert_resolves {
374 ($checker:expr, $subpath:expr, $expected:pat) => {
375 let uri = Uri::from(fixture_uri!($subpath));
376 let path = uri
377 .url
378 .to_file_path()
379 .expect("fixture uri should be a valid path");
380 let result = $checker.resolve_local_path(&path, &uri);
381 let result_subpath = result
382 .as_deref()
383 .map(|p| p.strip_prefix(fixtures_path!()).unwrap())
384 .map(|p| p.to_string_lossy());
385 assert!(
386 matches!(result_subpath.as_deref(), $expected),
387 "{:?} resolved to {:?} but should be {}",
388 $subpath,
389 result_subpath,
390 stringify!($expected)
391 );
392 };
393 }
394
395 #[tokio::test]
396 async fn test_default() {
397 // default behaviour accepts dir links as long as the directory exists.
398 let checker = FileChecker::new(None, vec![], None, true, false).unwrap();
399
400 assert_filecheck!(&checker, "filechecker/index_dir", Status::Ok(_));
401
402 // empty dir is accepted with '.' in index_files, but it contains no fragments.
403 assert_resolves!(
404 &checker,
405 "filechecker/empty_dir",
406 Ok("filechecker/empty_dir")
407 );
408 assert_filecheck!(&checker, "filechecker/empty_dir", Status::Ok(_));
409 assert_filecheck!(&checker, "filechecker/empty_dir#", Status::Ok(_));
410 assert_filecheck!(
411 &checker,
412 "filechecker/empty_dir#fragment",
413 Status::Error(InvalidFragment(_))
414 );
415
416 // even though index.html is present, it is not used because index_files is only
417 // '.', so no fragments are found.
418 assert_resolves!(
419 &checker,
420 "filechecker/index_dir",
421 Ok("filechecker/index_dir")
422 );
423 assert_filecheck!(
424 &checker,
425 "filechecker/index_dir#fragment",
426 Status::Error(InvalidFragment(_))
427 );
428 assert_filecheck!(
429 &checker,
430 "filechecker/index_dir#non-existingfragment",
431 Status::Error(InvalidFragment(_))
432 );
433
434 assert_filecheck!(&checker, "filechecker/same_name", Status::Ok(_));
435
436 // because no fallback extensions are configured
437 assert_resolves!(
438 &checker,
439 "filechecker/same_name",
440 Ok("filechecker/same_name")
441 );
442 assert_filecheck!(
443 &checker,
444 "filechecker/same_name#a",
445 Status::Error(InvalidFragment(_))
446 );
447 }
448
449 #[tokio::test]
450 async fn test_index_files() {
451 let checker = FileChecker::new(
452 None,
453 vec![],
454 Some(vec!["index.html".to_owned(), "index.md".to_owned()]),
455 true,
456 false,
457 )
458 .unwrap();
459
460 assert_resolves!(
461 &checker,
462 "filechecker/index_dir",
463 Ok("filechecker/index_dir/index.html")
464 );
465 assert_resolves!(
466 &checker,
467 "filechecker/index_md",
468 Ok("filechecker/index_md/index.md")
469 );
470 // empty is rejected because of no index.html
471 assert_resolves!(&checker, "filechecker/empty_dir", Err(InvalidIndexFile(_)));
472
473 // index.html is resolved and fragments are checked.
474 assert_filecheck!(&checker, "filechecker/index_dir#fragment", Status::Ok(_));
475 assert_filecheck!(
476 &checker,
477 "filechecker/index_dir#non-existingfragment",
478 Status::Error(InvalidFragment(_))
479 );
480
481 // directories which look like files should still have index files applied
482 assert_resolves!(
483 &checker,
484 "filechecker/dir_with_extension.html",
485 Err(InvalidIndexFile(_))
486 );
487 }
488
489 #[tokio::test]
490 async fn test_both_fallback_and_index_corner() {
491 let checker = FileChecker::new(
492 None,
493 vec!["html".to_owned()],
494 Some(vec!["index".to_owned()]),
495 false,
496 false,
497 )
498 .unwrap();
499
500 // this test case has a subdir 'same_name' and a file 'same_name.html'.
501 // this shows that the index file resolving is applied in this case and
502 // fallback extensions are not applied.
503 assert_resolves!(&checker, "filechecker/same_name", Err(InvalidIndexFile(_)));
504
505 // this directory has an index.html, but the index_files argument is only "index". this
506 // shows that fallback extensions are not applied to index file names, as the index.html is
507 // not found.
508 assert_resolves!(&checker, "filechecker/index_dir", Err(InvalidIndexFile(_)));
509
510 // a directory called 'dir_with_extension.html' exists. this test shows that fallback
511 // extensions must resolve to a file not a directory.
512 assert_resolves!(
513 &checker,
514 "filechecker/dir_with_extension",
515 Err(InvalidFilePath(_))
516 );
517 }
518
519 #[tokio::test]
520 async fn test_empty_index_list_corner() {
521 // empty index_files list will reject all directory links
522 let checker_no_indexes =
523 FileChecker::new(None, vec![], Some(vec![]), false, false).unwrap();
524 assert_resolves!(
525 &checker_no_indexes,
526 "filechecker/index_dir",
527 Err(InvalidIndexFile(_))
528 );
529 assert_resolves!(
530 &checker_no_indexes,
531 "filechecker/empty_dir",
532 Err(InvalidIndexFile(_))
533 );
534 }
535
536 #[tokio::test]
537 async fn test_index_list_of_directories_corner() {
538 // this test defines index_files to be a list of different names, all of which will
539 // resolve to an existing directory. however, because they are directories and not
540 // the special '.' name, these should not be accepted as valid index files.
541 let dir_names = vec![
542 String::new(),
543 "./.".to_owned(),
544 "..".to_owned(),
545 "/".to_owned(),
546 ];
547 let checker_dir_indexes =
548 FileChecker::new(None, vec![], Some(dir_names), false, false).unwrap();
549 assert_resolves!(
550 &checker_dir_indexes,
551 "filechecker/index_dir",
552 Err(InvalidIndexFile(_))
553 );
554 assert_resolves!(
555 &checker_dir_indexes,
556 "filechecker/empty_dir",
557 Err(InvalidIndexFile(_))
558 );
559 }
560
561 #[tokio::test]
562 async fn test_index_file_traversal_corner() {
563 // index file names can contain path fragments and they will be traversed.
564 let checker_dotdot = FileChecker::new(
565 None,
566 vec![],
567 Some(vec!["../index_dir/index.html".to_owned()]),
568 true,
569 false,
570 )
571 .unwrap();
572 assert_resolves!(
573 &checker_dotdot,
574 "filechecker/empty_dir#fragment",
575 Ok("filechecker/empty_dir/../index_dir/index.html")
576 );
577
578 // absolute paths to a file on disk should also work
579 let absolute_html = fixtures_path!()
580 .join("filechecker/index_dir/index.html")
581 .to_str()
582 .expect("expected utf-8 fixtures path")
583 .to_owned();
584 let checker_absolute =
585 FileChecker::new(None, vec![], Some(vec![absolute_html]), true, false).unwrap();
586 assert_resolves!(
587 &checker_absolute,
588 "filechecker/empty_dir#fragment",
589 Ok("filechecker/index_dir/index.html")
590 );
591 }
592
593 #[tokio::test]
594 async fn test_fallback_extensions_on_directories() {
595 let checker = FileChecker::new(None, vec!["html".to_owned()], None, true, false).unwrap();
596
597 // fallback extensions should be applied when directory links are resolved
598 // to directories (i.e., the default index_files behavior or if `.`
599 // appears in index_files).
600 assert_resolves!(
601 &checker,
602 "filechecker/same_name#a",
603 Ok("filechecker/same_name.html")
604 );
605
606 // currently, trailing slashes are ignored and fallback extensions are
607 // applied regardless. maybe links with trailing slash should be prevented
608 // from resolving to files.
609 assert_resolves!(
610 &checker,
611 "filechecker/same_name/",
612 Ok("filechecker/same_name.html")
613 );
614 }
615}