Skip to main content

lychee_lib/utils/
request.rs

1use percent_encoding::percent_decode_str;
2use reqwest::Url;
3use std::collections::HashSet;
4use std::path::{Path, PathBuf};
5
6use crate::{
7    Base, BasicAuthCredentials, ErrorKind, LycheeResult, Request, RequestError, Uri,
8    basic_auth::BasicAuthExtractor,
9    types::{ResolvedInputSource, uri::raw::RawUri},
10    utils::{path, url},
11};
12
13/// Extract basic auth credentials for a given URL.
14pub(crate) fn extract_credentials(
15    extractor: Option<&BasicAuthExtractor>,
16    uri: &Uri,
17) -> Option<BasicAuthCredentials> {
18    extractor.as_ref().and_then(|ext| ext.matches(uri))
19}
20
21/// Create a request from a raw URI.
22fn create_request(
23    raw_uri: &RawUri,
24    source: &ResolvedInputSource,
25    root_dir: Option<&PathBuf>,
26    base: Option<&Base>,
27    extractor: Option<&BasicAuthExtractor>,
28) -> LycheeResult<Request> {
29    let uri = try_parse_into_uri(raw_uri, source, root_dir, base)?;
30    let source = source.clone();
31    let element = raw_uri.element.clone();
32    let attribute = raw_uri.attribute.clone();
33    let credentials = extract_credentials(extractor, &uri);
34
35    Ok(Request::new(uri, source, element, attribute, credentials))
36}
37
38/// Try to parse the raw URI into a `Uri`.
39///
40/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text.
41/// If the base URL is not available, create a URI from the file path.
42///
43/// # Errors
44///
45/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base
46///   to create a valid URI.
47/// - If a URI cannot be created from the file path.
48/// - If the source is not a file path (i.e. the URI type is not supported).
49fn try_parse_into_uri(
50    raw_uri: &RawUri,
51    source: &ResolvedInputSource,
52    root_dir: Option<&PathBuf>,
53    base: Option<&Base>,
54) -> LycheeResult<Uri> {
55    let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir);
56    let uri = match Uri::try_from(raw_uri.clone()) {
57        Ok(uri) => uri,
58        Err(_) => match base {
59            Some(base_url) => match base_url.join(&text) {
60                Some(url) => Uri { url },
61                None => return Err(ErrorKind::InvalidBaseJoin(text.clone())),
62            },
63            None => match source {
64                ResolvedInputSource::FsPath(root) => {
65                    create_uri_from_file_path(root, &text, root_dir.is_none())?
66                }
67                _ => return Err(ErrorKind::UnsupportedUriType(text)),
68            },
69        },
70    };
71    Ok(uri)
72}
73
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
/// Returns `true` when the link text begins with `#`.
///
/// An empty string or a `#` that appears later in the text does not count.
pub(crate) fn is_anchor(text: &str) -> bool {
    matches!(text.chars().next(), Some('#'))
}
78
79/// Create a URI from a file path
80///
81/// # Errors
82///
83/// - If the link text is an anchor and the file name cannot be extracted from the file path.
84/// - If the path cannot be resolved.
85/// - If the resolved path cannot be converted to a URL.
86fn create_uri_from_file_path(
87    file_path: &Path,
88    link_text: &str,
89    ignore_absolute_local_links: bool,
90) -> LycheeResult<Uri> {
91    let target_path = if is_anchor(link_text) {
92        // For anchors, we need to append the anchor to the file name.
93        let file_name = file_path
94            .file_name()
95            .and_then(|name| name.to_str())
96            .ok_or_else(|| ErrorKind::InvalidFile(file_path.to_path_buf()))?;
97
98        format!("{file_name}{link_text}")
99    } else {
100        link_text.to_string()
101    };
102    let Ok(constructed_url) =
103        resolve_and_create_url(file_path, &target_path, ignore_absolute_local_links)
104    else {
105        return Err(ErrorKind::InvalidPathToUri(target_path));
106    };
107    Ok(Uri {
108        url: constructed_url,
109    })
110}
111
112/// Create requests out of the collected URLs.
113/// Returns a vector of valid URLs and errors. Valid URLs are deduplicated,
114/// request errors are not deduplicated.
115///
116/// If a URLs is ignored (because of the current settings),
117/// it will not be added to the results.
118pub(crate) fn create(
119    uris: Vec<RawUri>,
120    source: &ResolvedInputSource,
121    root_dir: Option<&PathBuf>,
122    base: Option<&Base>,
123    extractor: Option<&BasicAuthExtractor>,
124) -> Vec<Result<Request, RequestError>> {
125    let base = base.cloned().or_else(|| Base::from_source(source));
126
127    let mut requests = HashSet::<Request>::new();
128    let mut errors = Vec::<RequestError>::new();
129
130    for raw_uri in uris {
131        let result = create_request(&raw_uri, source, root_dir, base.as_ref(), extractor);
132        match result {
133            Ok(request) => {
134                requests.insert(request);
135            }
136            Err(e) => errors.push(RequestError::CreateRequestItem(
137                raw_uri.clone(),
138                source.clone(),
139                e,
140            )),
141        }
142    }
143
144    (requests.into_iter().map(Result::Ok))
145        .chain(errors.into_iter().map(Result::Err))
146        .collect()
147}
148
149/// Create a URI from a path
150///
151/// `src_path` is the path of the source file.
152/// `dest_path` is the path being linked to.
153/// The optional `base_uri` specifies the base URI to resolve the destination path against.
154///
155/// # Errors
156///
157/// - If the percent-decoded destination path cannot be decoded as UTF-8.
158/// - The path cannot be resolved
159/// - The resolved path cannot be converted to a URL.
160fn resolve_and_create_url(
161    src_path: &Path,
162    dest_path: &str,
163    ignore_absolute_local_links: bool,
164) -> LycheeResult<Url> {
165    let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path);
166
167    // Decode the destination path to avoid double-encoding
168    // This addresses the issue mentioned in the original comment about double-encoding
169    let decoded_dest = percent_decode_str(dest_path).decode_utf8()?;
170
171    let Ok(Some(resolved_path)) = path::resolve(
172        src_path,
173        &PathBuf::from(&*decoded_dest),
174        ignore_absolute_local_links,
175    ) else {
176        return Err(ErrorKind::InvalidPathToUri(decoded_dest.to_string()));
177    };
178
179    let Ok(mut url) = Url::from_file_path(&resolved_path) else {
180        return Err(ErrorKind::InvalidUrlFromPath(resolved_path.clone()));
181    };
182
183    url.set_fragment(fragment);
184    Ok(url)
185}
186
/// Prefix `text` with `root_dir` when `text` is an absolute local link.
///
/// The prefix is applied only if `text` starts with `/`, a `root_dir` is
/// given, and that directory is valid UTF-8. In every other case `text` is
/// returned unchanged. The two parts are concatenated as-is, without adding
/// or removing any separator.
fn prepend_root_dir_if_absolute_local_link(text: &str, root_dir: Option<&PathBuf>) -> String {
    let prefix = if text.starts_with('/') {
        root_dir.and_then(|dir| dir.to_str())
    } else {
        None
    };

    match prefix {
        Some(path_str) => format!("{path_str}{text}"),
        None => text.to_owned(),
    }
}
196
#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use std::num::NonZeroUsize;

    use crate::types::uri::raw::RawUriSpan;

    use super::*;

    /// Run `create` on the given raw URIs and keep only the requests that
    /// were constructed successfully, silently dropping link parsing errors.
    ///
    /// This keeps `Result` handling out of the test cases. An unexpected
    /// error still shows up because the number of returned requests differs
    /// from what the test expects.
    fn create_ok_only(
        uris: Vec<RawUri>,
        source: &ResolvedInputSource,
        root_dir: Option<&PathBuf>,
        base: Option<&Base>,
        extractor: Option<&BasicAuthExtractor>,
    ) -> Vec<Request> {
        create(uris, source, root_dir, base, extractor)
            .into_iter()
            .flatten()
            .collect()
    }

    /// Build a raw URI with the given text, no element or attribute, and a
    /// placeholder span.
    fn raw_uri(text: &'static str) -> RawUri {
        let span = RawUriSpan {
            line: NonZeroUsize::MAX,
            column: None,
        };
        RawUri {
            text: String::from(text),
            element: None,
            attribute: None,
            span,
        }
    }

    /// Remote base URL shared by the URL resolution tests.
    fn remote_base() -> Base {
        Base::try_from("https://example.com/path/page.html").unwrap()
    }

    /// Local base directory shared by the file path tests.
    fn local_base() -> Base {
        Base::Local(PathBuf::from("/tmp/lychee"))
    }

    /// Root directory shared by the `*_from_root_dir*` tests.
    fn lychee_root() -> PathBuf {
        PathBuf::from("/tmp/lychee")
    }

    /// Input source backed by an empty string.
    fn empty_string_source() -> ResolvedInputSource {
        ResolvedInputSource::String(Cow::Borrowed(""))
    }

    /// Input source backed by the file `/some/page.html`.
    fn page_file_source() -> ResolvedInputSource {
        ResolvedInputSource::FsPath(PathBuf::from("/some/page.html"))
    }

    /// Create requests for the single link `link` and assert that exactly
    /// one request was produced and that its URL equals `expected`.
    #[track_caller]
    fn assert_resolves_to(
        link: &'static str,
        source: &ResolvedInputSource,
        root_dir: Option<&PathBuf>,
        base: Option<&Base>,
        expected: &str,
    ) {
        let requests = create_ok_only(vec![raw_uri(link)], source, root_dir, base, None);

        assert_eq!(requests.len(), 1);
        assert!(requests.iter().any(|r| r.uri.url.as_str() == expected));
    }

    #[test]
    fn test_is_anchor() {
        assert!(is_anchor("#anchor"));
        assert!(!is_anchor("notan#anchor"));
    }

    #[test]
    fn test_create_uri_from_path() {
        let source_file = PathBuf::from("/README.md");
        let url = resolve_and_create_url(&source_file, "test+encoding", true).unwrap();
        assert_eq!(url.as_str(), "file:///test+encoding");
    }

    // --- Resolution against a remote base URL only ---

    #[test]
    fn test_relative_url_resolution() {
        assert_resolves_to(
            "relative.html",
            &empty_string_source(),
            None,
            Some(&remote_base()),
            "https://example.com/path/relative.html",
        );
    }

    #[test]
    fn test_absolute_url_resolution() {
        assert_resolves_to(
            "https://another.com/page",
            &empty_string_source(),
            None,
            Some(&remote_base()),
            "https://another.com/page",
        );
    }

    #[test]
    fn test_root_relative_url_resolution() {
        assert_resolves_to(
            "/root-relative",
            &empty_string_source(),
            None,
            Some(&remote_base()),
            "https://example.com/root-relative",
        );
    }

    #[test]
    fn test_parent_directory_url_resolution() {
        assert_resolves_to(
            "../parent",
            &empty_string_source(),
            None,
            Some(&remote_base()),
            "https://example.com/parent",
        );
    }

    #[test]
    fn test_fragment_url_resolution() {
        assert_resolves_to(
            "#fragment",
            &empty_string_source(),
            None,
            Some(&remote_base()),
            "https://example.com/path/page.html#fragment",
        );
    }

    // --- Resolution from a file source with a root directory, no base ---

    #[test]
    fn test_relative_url_resolution_from_root_dir() {
        assert_resolves_to(
            "relative.html",
            &page_file_source(),
            Some(&lychee_root()),
            None,
            "file:///some/relative.html",
        );
    }

    #[test]
    fn test_absolute_url_resolution_from_root_dir() {
        assert_resolves_to(
            "https://another.com/page",
            &page_file_source(),
            Some(&lychee_root()),
            None,
            "https://another.com/page",
        );
    }

    #[test]
    fn test_root_relative_url_resolution_from_root_dir() {
        assert_resolves_to(
            "/root-relative",
            &page_file_source(),
            Some(&lychee_root()),
            None,
            "file:///tmp/lychee/root-relative",
        );
    }

    #[test]
    fn test_parent_directory_url_resolution_from_root_dir() {
        assert_resolves_to(
            "../parent",
            &page_file_source(),
            Some(&lychee_root()),
            None,
            "file:///parent",
        );
    }

    #[test]
    fn test_fragment_url_resolution_from_root_dir() {
        assert_resolves_to(
            "#fragment",
            &page_file_source(),
            Some(&lychee_root()),
            None,
            "file:///some/page.html#fragment",
        );
    }

    // --- Resolution from a file source with both root directory and base ---

    #[test]
    fn test_relative_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "relative.html",
            &page_file_source(),
            Some(&lychee_root()),
            Some(&remote_base()),
            "https://example.com/path/relative.html",
        );
    }

    #[test]
    fn test_absolute_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "https://another.com/page",
            &page_file_source(),
            Some(&lychee_root()),
            Some(&remote_base()),
            "https://another.com/page",
        );
    }

    #[test]
    fn test_root_relative_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "/root-relative",
            &page_file_source(),
            Some(&lychee_root()),
            Some(&remote_base()),
            "https://example.com/tmp/lychee/root-relative",
        );
    }

    #[test]
    fn test_parent_directory_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "../parent",
            &page_file_source(),
            Some(&lychee_root()),
            Some(&remote_base()),
            "https://example.com/parent",
        );
    }

    #[test]
    fn test_fragment_url_resolution_from_root_dir_and_base_url() {
        assert_resolves_to(
            "#fragment",
            &page_file_source(),
            Some(&lychee_root()),
            Some(&remote_base()),
            "https://example.com/path/page.html#fragment",
        );
    }

    #[test]
    fn test_no_base_url_resolution() {
        assert_resolves_to(
            "https://example.com/page",
            &empty_string_source(),
            None,
            None,
            "https://example.com/page",
        );
    }

    // --- Direct `create_request` / `try_parse_into_uri` checks ---

    #[test]
    fn test_create_request_from_relative_file_path() {
        let base = local_base();
        let input_source = ResolvedInputSource::FsPath(PathBuf::from("page.html"));

        let link = raw_uri("file.html");
        let actual = create_request(&link, &input_source, None, Some(&base), None).unwrap();

        let expected_uri = Uri {
            url: Url::from_file_path("/tmp/lychee/file.html").unwrap(),
        };
        let expected = Request::new(expected_uri, input_source, None, None, None);

        assert_eq!(actual, expected);
    }

    #[test]
    fn test_create_request_from_relative_file_path_errors() {
        // relative links unsupported from stdin
        let from_stdin = create_request(
            &raw_uri("file.html"),
            &ResolvedInputSource::Stdin,
            None,
            None,
            None,
        );
        assert!(from_stdin.is_err());

        // error because no root-dir and no base-url
        let file_source = ResolvedInputSource::FsPath(PathBuf::from("page.html"));
        let without_root = create_request(&raw_uri("/file.html"), &file_source, None, None, None);
        assert!(without_root.is_err());
    }

    #[test]
    fn test_create_request_from_absolute_file_path() {
        let base = local_base();
        let input_source = ResolvedInputSource::FsPath(PathBuf::from("/tmp/lychee/page.html"));

        // Use an absolute path that's outside the base directory
        let link = raw_uri("/usr/local/share/doc/example.html");
        let actual = create_request(&link, &input_source, None, Some(&base), None).unwrap();

        let expected_uri = Uri {
            url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap(),
        };
        let expected = Request::new(expected_uri, input_source, None, None, None);

        assert_eq!(actual, expected);
    }

    #[test]
    fn test_parse_relative_path_into_uri() {
        let base = local_base();
        let source = empty_string_source();

        let link = raw_uri("relative.html");
        let uri = try_parse_into_uri(&link, &source, None, Some(&base)).unwrap();

        assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html");
    }

    // NOTE(review): despite the test name, the link below has no leading `/`,
    // so it looks like this exercises the same relative case as the previous
    // test. Confirm the intended input before changing it.
    #[test]
    fn test_parse_absolute_path_into_uri() {
        let base = local_base();
        let source = empty_string_source();

        let link = raw_uri("absolute.html");
        let uri = try_parse_into_uri(&link, &source, None, Some(&base)).unwrap();

        assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html");
    }

    // --- `prepend_root_dir_if_absolute_local_link` ---

    #[test]
    fn test_prepend_with_absolute_local_link_and_root_dir() {
        let root_dir = PathBuf::from("/root");
        assert_eq!(
            prepend_root_dir_if_absolute_local_link("/absolute/path", Some(&root_dir)),
            "/root/absolute/path"
        );
    }

    #[test]
    fn test_prepend_with_absolute_local_link_and_no_root_dir() {
        assert_eq!(
            prepend_root_dir_if_absolute_local_link("/absolute/path", None),
            "/absolute/path"
        );
    }

    #[test]
    fn test_prepend_with_relative_link_and_root_dir() {
        let root_dir = PathBuf::from("/root");
        assert_eq!(
            prepend_root_dir_if_absolute_local_link("relative/path", Some(&root_dir)),
            "relative/path"
        );
    }

    #[test]
    fn test_prepend_with_relative_link_and_no_root_dir() {
        assert_eq!(
            prepend_root_dir_if_absolute_local_link("relative/path", None),
            "relative/path"
        );
    }
}