Skip to main content

lychee_lib/checker/
file.rs

1use http::StatusCode;
2use log::warn;
3use std::borrow::Cow;
4use std::path::{Path, PathBuf};
5
6use crate::checker::wikilink::resolver::WikilinkResolver;
7use crate::{
8    Base, ErrorKind, Result, Status, Uri,
9    utils::fragment_checker::{FragmentChecker, FragmentInput},
10};
11
12/// A utility for checking the existence and validity of file-based URIs.
13///
14/// `FileChecker` resolves and validates file paths, handling both absolute and relative paths.
15/// It supports base path resolution, fallback extensions for files without extensions,
16/// and optional fragment checking for HTML files.
17#[derive(Debug, Clone)]
18pub(crate) struct FileChecker {
19    /// Base path or URL used for resolving relative paths.
20    base: Option<Base>,
21    /// List of file extensions to try if the original path doesn't exist.
22    fallback_extensions: Vec<String>,
23    /// If specified, resolves to one of the given index files if the original path
24    /// is a directory.
25    ///
26    /// If non-`None`, a directory must contain at least one of the file names
27    /// in order to be considered a valid link target. Index files names are
28    /// required to match regular files, aside from the special `.` name which
29    /// will match the directory itself.
30    ///
31    /// If `None`, index file checking is disabled and directory links are valid
32    /// as long as the directory exists on disk.
33    index_files: Option<Vec<String>>,
34    /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files.
35    include_fragments: bool,
36    /// Utility for performing fragment checks in HTML files.
37    fragment_checker: FragmentChecker,
38    /// Utility for optionally resolving Wikilinks.
39    wikilink_resolver: Option<WikilinkResolver>,
40}
41
42impl FileChecker {
43    /// Creates a new `FileChecker` with the given configuration.
44    ///
45    /// # Arguments
46    ///
47    /// * `base` - Optional base path or URL for resolving relative paths.
48    /// * `fallback_extensions` - List of extensions to try if the original file is not found.
49    /// * `index_files` - Optional list of index file names to search for if the path is a directory.
50    /// * `include_fragments` - Whether to check for fragment existence in HTML files.
51    /// * `include_wikilinks` - Whether to check the existence of Wikilinks found in Markdown files .
52    ///
53    /// # Errors
54    ///
55    /// Fails if an invalid `base` is provided when including wikilinks.
56    pub(crate) fn new(
57        base: Option<Base>,
58        fallback_extensions: Vec<String>,
59        index_files: Option<Vec<String>>,
60        include_fragments: bool,
61        include_wikilinks: bool,
62    ) -> Result<Self> {
63        let wikilink_resolver = if include_wikilinks {
64            Some(WikilinkResolver::new(
65                base.as_ref(),
66                fallback_extensions.clone(),
67            )?)
68        } else {
69            None
70        };
71
72        Ok(Self {
73            base,
74            fallback_extensions,
75            index_files,
76            include_fragments,
77            fragment_checker: FragmentChecker::new(),
78            wikilink_resolver,
79        })
80    }
81
82    /// Checks the given file URI for existence and validity.
83    ///
84    /// This method resolves the URI to a file path, checks if the file exists,
85    /// and optionally checks for the existence of fragments in HTML files.
86    ///
87    /// # Arguments
88    ///
89    /// * `uri` - The URI to check.
90    ///
91    /// # Returns
92    ///
93    /// Returns a `Status` indicating the result of the check.
94    pub(crate) async fn check(&self, uri: &Uri) -> Status {
95        let Ok(path) = uri.url.to_file_path() else {
96            return ErrorKind::InvalidFilePath(uri.clone()).into();
97        };
98
99        let path = self.resolve_base(&path);
100        let path = self.resolve_local_path(&path, uri);
101        match path {
102            Ok(path) => self.check_file(path.as_ref(), uri).await,
103            Err(err) => err.into(),
104        }
105    }
106
107    /// Resolves the given path using the base path, if one is set.
108    ///
109    /// # Arguments
110    ///
111    /// * `path` - The path to resolve.
112    ///
113    /// # Returns
114    ///
115    /// Returns the resolved path as a `PathBuf`, or the original path
116    /// if no base path is defined.
117    fn resolve_base(&self, path: &Path) -> PathBuf {
118        if let Some(Base::Local(base_path)) = &self.base {
119            if path.is_absolute() {
120                let absolute_base_path = if base_path.is_relative() {
121                    std::env::current_dir().unwrap_or_default().join(base_path)
122                } else {
123                    base_path.clone()
124                };
125
126                let stripped = path.strip_prefix("/").unwrap_or(path);
127                absolute_base_path.join(stripped)
128            } else {
129                base_path.join(path)
130            }
131        } else {
132            path.to_path_buf()
133        }
134    }
135
136    /// Resolves the given local path by applying logic which is specific to local file
137    /// checking - currently, this includes fallback extensions and index files.
138    ///
139    /// # Arguments
140    ///
141    /// * `path` - The path to check. Need not exist.
142    /// * `uri` - The original URI, used for error reporting.
143    ///
144    /// # Returns
145    ///
146    /// Returns `Ok` with the resolved path if it is valid, otherwise returns
147    /// `Err` with an appropriate error. The returned path, if any, is guaranteed
148    /// to exist and may be a file or a directory.
149    fn resolve_local_path<'a>(&self, path: &'a Path, uri: &Uri) -> Result<Cow<'a, Path>> {
150        let path = match path.metadata() {
151            // for non-existing paths, attempt fallback extensions
152            // if fallback extensions don't help, try wikilinks
153            Err(e) if e.kind() == std::io::ErrorKind::NotFound => self
154                .apply_fallback_extensions(path, uri)
155                .or_else(|_| {
156                    if let Some(resolver) = &self.wikilink_resolver {
157                        resolver.resolve(path, uri)
158                    } else {
159                        Err(ErrorKind::InvalidFilePath(uri.clone()))
160                    }
161                })
162                .map(Cow::Owned),
163
164            // other IO errors are unexpected and should fail the check
165            Err(e) => Err(ErrorKind::ReadFileInput(e, path.to_path_buf())),
166
167            // existing directories are resolved via index files
168            Ok(meta) if meta.is_dir() => self.apply_index_files(path).map(Cow::Owned),
169
170            // otherwise, path is an existing file - just return the path
171            Ok(_) => Ok(Cow::Borrowed(path)),
172        };
173
174        // if initial resolution results in a directory, also attempts to apply
175        // fallback extensions. probably, this always makes sense because
176        // directories are treated as having no fragments, so a real file with
177        // a fallback extension (if it exists) will potentially contain more
178        // fragments and thus be "more useful".
179        //
180        // (currently, this case is only reachable if `.` is in the index_files list.)
181        match path {
182            Ok(dir_path) if dir_path.is_dir() => self
183                .apply_fallback_extensions(&dir_path, uri)
184                .map(Cow::Owned)
185                .or(Ok(dir_path)),
186            Ok(path) => Ok(path),
187            Err(err) => Err(err),
188        }
189    }
190
191    /// Resolves a path to a file, applying fallback extensions if necessary.
192    ///
193    /// This function will try to find a file, first by attempting the given path
194    /// itself, then by attempting the path with each extension from
195    /// [`FileChecker::fallback_extensions`]. The first existing file (not directory),
196    /// if any, will be returned.
197    ///
198    /// # Arguments
199    ///
200    /// * `path` - The path to resolve.
201    /// * `uri` - The original URI, used for error reporting.
202    ///
203    /// # Returns
204    ///
205    /// Returns `Ok(PathBuf)` with the resolved file path, or `Err` if no valid file is found.
206    /// If `Ok` is returned, the contained `PathBuf` is guaranteed to exist and be a file.
207    fn apply_fallback_extensions(&self, path: &Path, uri: &Uri) -> Result<PathBuf> {
208        // If it's already a file, use it directly
209        if path.is_file() {
210            return Ok(path.to_path_buf());
211        }
212
213        // Try fallback extensions
214        let mut path_buf = path.to_path_buf();
215        for ext in &self.fallback_extensions {
216            path_buf.set_extension(ext);
217            if path_buf.is_file() {
218                return Ok(path_buf);
219            }
220        }
221
222        Err(ErrorKind::InvalidFilePath(uri.clone()))
223    }
224
225    /// Tries to find an index file in the given directory, returning the first match.
226    /// The index file behavior is specified by [`FileChecker::index_files`].
227    ///
228    /// If this is non-`None`, index files must exist and resolved index files are
229    /// required to be files, aside from the special name `.` - this will match the
230    /// directory itself.
231    ///
232    /// If `None`, index file resolution is disabled and this function simply
233    /// returns the given path.
234    ///
235    /// # Arguments
236    ///
237    /// * `dir_path` - The directory within which to search for index files.
238    ///   This is assumed to be an existing directory.
239    ///
240    /// # Returns
241    ///
242    /// Returns `Ok(PathBuf)` pointing to the first existing index file, or
243    /// `Err` if no index file is found. If `Ok` is returned, the contained `PathBuf`
244    /// is guaranteed to exist. In most cases, the returned path will be a file path.
245    ///
246    /// If index files are disabled, simply returns `Ok(dir_path)`.
247    fn apply_index_files(&self, dir_path: &Path) -> Result<PathBuf> {
248        // this implements the "disabled" case by treating a directory as its
249        // own index file.
250        let index_names_to_try = match &self.index_files {
251            Some(names) => &names[..],
252            None => &[".".to_owned()],
253        };
254
255        let invalid_index_error = || {
256            // Drop empty index file names. These will never be accepted as valid
257            // index files, and doing this makes cleaner error reporting.
258            let mut names = index_names_to_try.to_vec();
259            names.retain(|x| !x.is_empty());
260
261            ErrorKind::InvalidIndexFile(names)
262        };
263
264        index_names_to_try
265            .iter()
266            .find_map(|filename| {
267                // for some special index file names, we accept directories as well
268                // as files.
269                let exists = match filename.as_str() {
270                    "." => Path::exists,
271                    _ => Path::is_file,
272                };
273
274                let path = dir_path.join(filename);
275                exists(&path).then_some(path)
276            })
277            .ok_or_else(invalid_index_error)
278    }
279
280    /// Checks a resolved file, optionally verifying fragments for HTML files.
281    ///
282    /// # Arguments
283    ///
284    /// * `path` - The resolved path to check.
285    /// * `uri` - The original URI, used for error reporting.
286    ///
287    /// # Returns
288    ///
289    /// Returns a `Status` indicating the result of the check.
290    async fn check_file(&self, path: &Path, uri: &Uri) -> Status {
291        if self.include_fragments {
292            self.check_fragment(path, uri).await
293        } else {
294            Status::Ok(StatusCode::OK)
295        }
296    }
297
298    /// Checks for the existence of a fragment in a path.
299    ///
300    /// The given path may be a file or a directory. A directory
301    /// is treated as if it was an empty file with no fragments.
302    ///
303    /// # Arguments
304    ///
305    /// * `path` - The path to the file or directory. Assumed to exist.
306    /// * `uri` - The original URI, containing the fragment to check.
307    ///
308    /// # Returns
309    ///
310    /// Returns a `Status` indicating the result of the fragment check.
311    async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
312        // for absent or trivial fragments, always return success.
313        if uri.url.fragment().is_none_or(str::is_empty) {
314            return Status::Ok(StatusCode::OK);
315        }
316
317        // directories are treated as if they were a file with no fragments.
318        // reaching here means we have a non-trivial fragment on a directory,
319        // so return error.
320        if path.is_dir() {
321            return ErrorKind::InvalidFragment(uri.clone()).into();
322        }
323
324        match FragmentInput::from_path(path).await {
325            Ok(input) => match self.fragment_checker.check(input, &uri.url).await {
326                Ok(true) => Status::Ok(StatusCode::OK),
327                Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
328                Err(err) => {
329                    warn!("Skipping fragment check for {uri} due to the following error: {err}");
330                    Status::Ok(StatusCode::OK)
331                }
332            },
333            Err(err) => {
334                warn!("Skipping fragment check for {uri} due to the following error: {err}");
335                Status::Ok(StatusCode::OK)
336            }
337        }
338    }
339}
340
341#[cfg(test)]
342mod tests {
343    use super::FileChecker;
344    use crate::{
345        ErrorKind::{InvalidFilePath, InvalidFragment, InvalidIndexFile},
346        Status, Uri,
347    };
348    use test_utils::{fixture_uri, fixtures_path};
349
350    /// Calls [`FileChecker::check`] on the given [`FileChecker`] with given URL
351    /// path (relative to the fixtures directory).
352    ///
353    /// The result of checking the link is matched against the given pattern.
354    macro_rules! assert_filecheck {
355        ($checker:expr, $path:expr, $pattern:pat) => {
356            let uri = Uri::from(fixture_uri!($path));
357            let result = $checker.check(&uri).await;
358            assert!(
359                matches!(result, $pattern),
360                "assertion failed: {} should be {} but was '{:?}'",
361                &uri,
362                stringify!($pattern),
363                &result
364            );
365        };
366    }
367
368    /// Calls [`FileChecker::resolve_local_path`] on the given [`FileChecker`]
369    /// with given URL path (relative to the fixtures directory).
370    ///
371    /// The result of resolving the link is matched against the given pattern.
372    /// The pattern should match values of type `Result<&str, ErrorKind>`.
373    macro_rules! assert_resolves {
374        ($checker:expr, $subpath:expr, $expected:pat) => {
375            let uri = Uri::from(fixture_uri!($subpath));
376            let path = uri
377                .url
378                .to_file_path()
379                .expect("fixture uri should be a valid path");
380            let result = $checker.resolve_local_path(&path, &uri);
381            let result_subpath = result
382                .as_deref()
383                .map(|p| p.strip_prefix(fixtures_path!()).unwrap())
384                .map(|p| p.to_string_lossy());
385            assert!(
386                matches!(result_subpath.as_deref(), $expected),
387                "{:?} resolved to {:?} but should be {}",
388                $subpath,
389                result_subpath,
390                stringify!($expected)
391            );
392        };
393    }
394
395    #[tokio::test]
396    async fn test_default() {
397        // default behaviour accepts dir links as long as the directory exists.
398        let checker = FileChecker::new(None, vec![], None, true, false).unwrap();
399
400        assert_filecheck!(&checker, "filechecker/index_dir", Status::Ok(_));
401
402        // empty dir is accepted with '.' in index_files, but it contains no fragments.
403        assert_resolves!(
404            &checker,
405            "filechecker/empty_dir",
406            Ok("filechecker/empty_dir")
407        );
408        assert_filecheck!(&checker, "filechecker/empty_dir", Status::Ok(_));
409        assert_filecheck!(&checker, "filechecker/empty_dir#", Status::Ok(_));
410        assert_filecheck!(
411            &checker,
412            "filechecker/empty_dir#fragment",
413            Status::Error(InvalidFragment(_))
414        );
415
416        // even though index.html is present, it is not used because index_files is only
417        // '.', so no fragments are found.
418        assert_resolves!(
419            &checker,
420            "filechecker/index_dir",
421            Ok("filechecker/index_dir")
422        );
423        assert_filecheck!(
424            &checker,
425            "filechecker/index_dir#fragment",
426            Status::Error(InvalidFragment(_))
427        );
428        assert_filecheck!(
429            &checker,
430            "filechecker/index_dir#non-existingfragment",
431            Status::Error(InvalidFragment(_))
432        );
433
434        assert_filecheck!(&checker, "filechecker/same_name", Status::Ok(_));
435
436        // because no fallback extensions are configured
437        assert_resolves!(
438            &checker,
439            "filechecker/same_name",
440            Ok("filechecker/same_name")
441        );
442        assert_filecheck!(
443            &checker,
444            "filechecker/same_name#a",
445            Status::Error(InvalidFragment(_))
446        );
447    }
448
449    #[tokio::test]
450    async fn test_index_files() {
451        let checker = FileChecker::new(
452            None,
453            vec![],
454            Some(vec!["index.html".to_owned(), "index.md".to_owned()]),
455            true,
456            false,
457        )
458        .unwrap();
459
460        assert_resolves!(
461            &checker,
462            "filechecker/index_dir",
463            Ok("filechecker/index_dir/index.html")
464        );
465        assert_resolves!(
466            &checker,
467            "filechecker/index_md",
468            Ok("filechecker/index_md/index.md")
469        );
470        // empty is rejected because of no index.html
471        assert_resolves!(&checker, "filechecker/empty_dir", Err(InvalidIndexFile(_)));
472
473        // index.html is resolved and fragments are checked.
474        assert_filecheck!(&checker, "filechecker/index_dir#fragment", Status::Ok(_));
475        assert_filecheck!(
476            &checker,
477            "filechecker/index_dir#non-existingfragment",
478            Status::Error(InvalidFragment(_))
479        );
480
481        // directories which look like files should still have index files applied
482        assert_resolves!(
483            &checker,
484            "filechecker/dir_with_extension.html",
485            Err(InvalidIndexFile(_))
486        );
487    }
488
489    #[tokio::test]
490    async fn test_both_fallback_and_index_corner() {
491        let checker = FileChecker::new(
492            None,
493            vec!["html".to_owned()],
494            Some(vec!["index".to_owned()]),
495            false,
496            false,
497        )
498        .unwrap();
499
500        // this test case has a subdir 'same_name' and a file 'same_name.html'.
501        // this shows that the index file resolving is applied in this case and
502        // fallback extensions are not applied.
503        assert_resolves!(&checker, "filechecker/same_name", Err(InvalidIndexFile(_)));
504
505        // this directory has an index.html, but the index_files argument is only "index". this
506        // shows that fallback extensions are not applied to index file names, as the index.html is
507        // not found.
508        assert_resolves!(&checker, "filechecker/index_dir", Err(InvalidIndexFile(_)));
509
510        // a directory called 'dir_with_extension.html' exists. this test shows that fallback
511        // extensions must resolve to a file not a directory.
512        assert_resolves!(
513            &checker,
514            "filechecker/dir_with_extension",
515            Err(InvalidFilePath(_))
516        );
517    }
518
519    #[tokio::test]
520    async fn test_empty_index_list_corner() {
521        // empty index_files list will reject all directory links
522        let checker_no_indexes =
523            FileChecker::new(None, vec![], Some(vec![]), false, false).unwrap();
524        assert_resolves!(
525            &checker_no_indexes,
526            "filechecker/index_dir",
527            Err(InvalidIndexFile(_))
528        );
529        assert_resolves!(
530            &checker_no_indexes,
531            "filechecker/empty_dir",
532            Err(InvalidIndexFile(_))
533        );
534    }
535
536    #[tokio::test]
537    async fn test_index_list_of_directories_corner() {
538        // this test defines index_files to be a list of different names, all of which will
539        // resolve to an existing directory. however, because they are directories and not
540        // the special '.' name, these should not be accepted as valid index files.
541        let dir_names = vec![
542            String::new(),
543            "./.".to_owned(),
544            "..".to_owned(),
545            "/".to_owned(),
546        ];
547        let checker_dir_indexes =
548            FileChecker::new(None, vec![], Some(dir_names), false, false).unwrap();
549        assert_resolves!(
550            &checker_dir_indexes,
551            "filechecker/index_dir",
552            Err(InvalidIndexFile(_))
553        );
554        assert_resolves!(
555            &checker_dir_indexes,
556            "filechecker/empty_dir",
557            Err(InvalidIndexFile(_))
558        );
559    }
560
561    #[tokio::test]
562    async fn test_index_file_traversal_corner() {
563        // index file names can contain path fragments and they will be traversed.
564        let checker_dotdot = FileChecker::new(
565            None,
566            vec![],
567            Some(vec!["../index_dir/index.html".to_owned()]),
568            true,
569            false,
570        )
571        .unwrap();
572        assert_resolves!(
573            &checker_dotdot,
574            "filechecker/empty_dir#fragment",
575            Ok("filechecker/empty_dir/../index_dir/index.html")
576        );
577
578        // absolute paths to a file on disk should also work
579        let absolute_html = fixtures_path!()
580            .join("filechecker/index_dir/index.html")
581            .to_str()
582            .expect("expected utf-8 fixtures path")
583            .to_owned();
584        let checker_absolute =
585            FileChecker::new(None, vec![], Some(vec![absolute_html]), true, false).unwrap();
586        assert_resolves!(
587            &checker_absolute,
588            "filechecker/empty_dir#fragment",
589            Ok("filechecker/index_dir/index.html")
590        );
591    }
592
593    #[tokio::test]
594    async fn test_fallback_extensions_on_directories() {
595        let checker = FileChecker::new(None, vec!["html".to_owned()], None, true, false).unwrap();
596
597        // fallback extensions should be applied when directory links are resolved
598        // to directories (i.e., the default index_files behavior or if `.`
599        // appears in index_files).
600        assert_resolves!(
601            &checker,
602            "filechecker/same_name#a",
603            Ok("filechecker/same_name.html")
604        );
605
606        // currently, trailing slashes are ignored and fallback extensions are
607        // applied regardless. maybe links with trailing slash should be prevented
608        // from resolving to files.
609        assert_resolves!(
610            &checker,
611            "filechecker/same_name/",
612            Ok("filechecker/same_name.html")
613        );
614    }
615}