Skip to main content

lychee_lib/checker/
website.rs

1use crate::{
2    BasicAuthCredentials, ErrorKind, FileType, Status, Uri,
3    chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain},
4    quirks::Quirks,
5    ratelimit::HostPool,
6    retry::RetryExt,
7    types::{redirect_history::RedirectHistory, uri::github::GithubUri},
8    utils::fragment_checker::{FragmentChecker, FragmentInput},
9};
10use async_trait::async_trait;
11use http::{Method, StatusCode};
12use octocrab::Octocrab;
13use reqwest::{Request, header::CONTENT_TYPE};
14use std::{borrow::Cow, collections::HashSet, path::Path, sync::Arc, time::Duration};
15use url::Url;
16
17#[derive(Debug, Clone)]
18pub(crate) struct WebsiteChecker {
19    /// Request method used for making requests.
20    method: reqwest::Method,
21
22    /// GitHub client used for requests.
23    github_client: Option<Octocrab>,
24
25    /// The chain of plugins to be executed on each request.
26    plugin_request_chain: RequestChain,
27
28    /// Maximum number of retries per request before returning an error.
29    max_retries: u64,
30
31    /// Initial wait time between retries of failed requests. This doubles after
32    /// each failure.
33    retry_wait_time: Duration,
34
35    /// Set of accepted return codes / status codes.
36    ///
37    /// Unmatched return codes/ status codes are deemed as errors.
38    accepted: HashSet<StatusCode>,
39
40    /// Requires using HTTPS when it's available.
41    ///
42    /// This would treat unencrypted links as errors when HTTPS is available.
43    require_https: bool,
44
45    /// Whether to check the existence of fragments in the response HTML files.
46    ///
47    /// Will be disabled if the request method is `HEAD`.
48    include_fragments: bool,
49
50    /// Utility for performing fragment checks in HTML files.
51    fragment_checker: FragmentChecker,
52
53    /// Keep track of HTTP redirections for reporting
54    redirect_history: RedirectHistory,
55
56    /// Optional host pool for per-host rate limiting.
57    ///
58    /// When present, HTTP requests will be routed through this pool for
59    /// rate limiting. When None, requests go directly through `reqwest_client`.
60    host_pool: Arc<HostPool>,
61}
62
63impl WebsiteChecker {
64    /// Get a reference to `HostPool`
65    #[must_use]
66    pub(crate) fn host_pool(&self) -> Arc<HostPool> {
67        self.host_pool.clone()
68    }
69
70    #[allow(clippy::too_many_arguments)]
71    pub(crate) fn new(
72        method: reqwest::Method,
73        retry_wait_time: Duration,
74        redirect_history: RedirectHistory,
75        max_retries: u64,
76        accepted: HashSet<StatusCode>,
77        github_client: Option<Octocrab>,
78        require_https: bool,
79        plugin_request_chain: RequestChain,
80        include_fragments: bool,
81        host_pool: Arc<HostPool>,
82    ) -> Self {
83        Self {
84            method,
85            github_client,
86            plugin_request_chain,
87            redirect_history,
88            max_retries,
89            retry_wait_time,
90            accepted,
91            require_https,
92            include_fragments,
93            fragment_checker: FragmentChecker::new(),
94            host_pool,
95        }
96    }
97
98    /// Retry requests up to `max_retries` times
99    /// with an exponential backoff.
100    /// Note that, in addition, there also is a host-specific backoff
101    /// when host-specific rate limiting or errors are detected.
102    pub(crate) async fn retry_request(&self, request: Request) -> Status {
103        let mut retries: u64 = 0;
104        let mut wait_time = self.retry_wait_time;
105        let mut status = self.check_default(clone_unwrap(&request)).await;
106        while retries < self.max_retries {
107            if status.is_success() || !status.should_retry() {
108                return status;
109            }
110            retries += 1;
111            tokio::time::sleep(wait_time).await;
112            wait_time = wait_time.saturating_mul(2);
113            status = self.check_default(clone_unwrap(&request)).await;
114        }
115
116        status
117    }
118
119    /// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest).
120    async fn check_default(&self, request: Request) -> Status {
121        let method = request.method().clone();
122        let request_url = request.url().clone();
123
124        let check_request_fragments = self.include_fragments
125            && method == Method::GET
126            && request_url.fragment().is_some_and(|x| !x.is_empty());
127
128        match self
129            .host_pool
130            .execute_request(request, check_request_fragments)
131            .await
132        {
133            Ok(response) => {
134                let status = Status::new(&response, &self.accepted);
135                // when `accept=200,429`, `status_code=429` will be treated as success
136                // but we are not able the check the fragment since it's inapplicable.
137                if let Some(content) = response.text
138                    && check_request_fragments
139                    && response.status.is_success()
140                {
141                    let Some(content_type) = response
142                        .headers
143                        .get(CONTENT_TYPE)
144                        .and_then(|header| header.to_str().ok())
145                    else {
146                        return status;
147                    };
148
149                    let file_type = match content_type {
150                        ct if ct.starts_with("text/html") => FileType::Html,
151                        ct if ct.starts_with("text/markdown") => FileType::Markdown,
152                        ct if ct.starts_with("text/plain") => {
153                            let path = Path::new(response.url.path());
154                            match path.extension() {
155                                Some(ext) if ext.eq_ignore_ascii_case("md") => FileType::Markdown,
156                                _ => return status,
157                            }
158                        }
159                        _ => return status,
160                    };
161
162                    self.check_html_fragment(request_url, status, &content, file_type)
163                        .await
164                } else {
165                    status
166                }
167            }
168            Err(e) => e.into(),
169        }
170    }
171
172    async fn check_html_fragment(
173        &self,
174        url: Url,
175        status: Status,
176        content: &str,
177        file_type: FileType,
178    ) -> Status {
179        match self
180            .fragment_checker
181            .check(
182                FragmentInput {
183                    content: Cow::Borrowed(content),
184                    file_type,
185                },
186                &url,
187            )
188            .await
189        {
190            Ok(true) => status,
191            Ok(false) => Status::Error(ErrorKind::InvalidFragment(url.into())),
192            Err(e) => Status::Error(e),
193        }
194    }
195
196    /// Checks the given URI of a website.
197    ///
198    /// # Errors
199    ///
200    /// This returns an `Err` if
201    /// - The URI is invalid.
202    /// - The request failed.
203    /// - The response status code is not accepted.
204    /// - The URI cannot be converted to HTTPS.
205    pub(crate) async fn check_website(
206        &self,
207        uri: &Uri,
208        credentials: Option<BasicAuthCredentials>,
209    ) -> Result<Status, ErrorKind> {
210        let default_chain: RequestChain = Chain::new(vec![
211            Box::<Quirks>::default(),
212            Box::new(credentials),
213            Box::new(self.clone()),
214        ]);
215
216        let status = self.check_website_inner(uri, &default_chain).await;
217        let status = self
218            .handle_insecure_url(uri, &default_chain, status)
219            .await?;
220        Ok(self.redirect_history.handle_redirected(&uri.url, status))
221    }
222
223    /// Mark HTTP URLs as insecure, if the user required HTTPS
224    /// and the URL is available under HTTPS.
225    async fn handle_insecure_url(
226        &self,
227        uri: &Uri,
228        default_chain: &Chain<Request, Status>,
229        status: Status,
230    ) -> Result<Status, ErrorKind> {
231        if self.require_https
232            && uri.scheme() == "http"
233            && let Status::Ok(_) = status
234        {
235            let https_uri = uri.to_https()?;
236            let is_https_available = self
237                .check_website_inner(&https_uri, default_chain)
238                .await
239                .is_success();
240
241            if is_https_available {
242                return Ok(Status::Error(ErrorKind::InsecureURL(https_uri)));
243            }
244        }
245
246        Ok(status)
247    }
248
249    /// Checks the given URI of a website.
250    ///
251    /// Unsupported schemes will be ignored
252    ///
253    /// Note: we use `inner` to improve compile times by avoiding monomorphization
254    ///
255    /// # Errors
256    ///
257    /// This returns an `Err` if
258    /// - The URI is invalid.
259    /// - The request failed.
260    /// - The response status code is not accepted.
261    async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
262        let request = self.host_pool.build_request(self.method.clone(), uri);
263
264        let request = match request {
265            Ok(r) => r,
266            Err(e) => return e.into(),
267        };
268
269        let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
270            .traverse(request)
271            .await;
272
273        self.handle_github(status, uri).await
274    }
275
276    // Pull out the heavy machinery in case of a failed normal request.
277    // This could be a GitHub URL and we ran into the rate limiter.
278    // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023)
279    async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
280        if status.is_success() {
281            return status;
282        }
283
284        if let Ok(github_uri) = GithubUri::try_from(uri) {
285            let status = self.check_github(github_uri).await;
286            if status.is_success() {
287                return status;
288            }
289        }
290
291        status
292    }
293
294    /// Check a `uri` hosted on `GitHub` via the GitHub API.
295    ///
296    /// # Caveats
297    ///
298    /// Files inside private repositories won't get checked and instead would
299    /// be reported as valid if the repository itself is reachable through the
300    /// API.
301    ///
302    /// A better approach would be to download the file through the API or
303    /// clone the repo, but we chose the pragmatic approach.
304    async fn check_github(&self, uri: GithubUri) -> Status {
305        let Some(client) = &self.github_client else {
306            return ErrorKind::MissingGitHubToken.into();
307        };
308        let repo = match client.repos(&uri.owner, &uri.repo).get().await {
309            Ok(repo) => repo,
310            Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
311        };
312        if let Some(true) = repo.private {
313            return Status::Ok(StatusCode::OK);
314        } else if let Some(endpoint) = uri.endpoint {
315            return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
316                .into();
317        }
318        Status::Ok(StatusCode::OK)
319    }
320}
321
322/// Clones a `reqwest::Request`.
323///
324/// # Safety
325///
326/// This panics if the request cannot be cloned. This should only happen if the
327/// request body is a `reqwest` stream. We disable the `stream` feature, so the
328/// body should never be a stream.
329///
330/// See <https://github.com/seanmonstar/reqwest/blob/de5dbb1ab849cc301dcefebaeabdf4ce2e0f1e53/src/async_impl/body.rs#L168>
331fn clone_unwrap(request: &Request) -> Request {
332    request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled")
333}
334
335#[async_trait]
336impl Handler<Request, Status> for WebsiteChecker {
337    async fn handle(&mut self, input: Request) -> ChainResult<Request, Status> {
338        ChainResult::Done(self.retry_request(input).await)
339    }
340}
341
#[cfg(test)]
mod tests {
    use std::{sync::Arc, time::Duration};

    use http::Method;
    use octocrab::Octocrab;

    use crate::{
        Uri,
        chain::RequestChain,
        checker::website::WebsiteChecker,
        ratelimit::HostPool,
        types::{
            DEFAULT_ACCEPTED_STATUS_CODES, redirect_history::RedirectHistory,
            uri::github::GithubUri,
        },
    };

    /// Builds a checker around `client` with retries, the HTTPS requirement
    /// and fragment checking all switched off.
    fn get_checker(client: Octocrab) -> WebsiteChecker {
        let host_pool = Arc::new(HostPool::default());
        let retry_wait_time = Duration::ZERO;
        let max_retries = 0;
        let require_https = false;
        let include_fragments = false;

        WebsiteChecker::new(
            Method::GET,
            retry_wait_time,
            RedirectHistory::new(),
            max_retries,
            DEFAULT_ACCEPTED_STATUS_CODES.clone(),
            Some(client),
            require_https,
            RequestChain::default(),
            include_fragments,
            host_pool,
        )
    }

    /// Test GitHub client integration.
    /// This prevents a regression of <https://github.com/lycheeverse/lychee/issues/2024>
    #[tokio::test]
    async fn test_github_client_integration() {
        let client = Octocrab::builder().personal_token("dummy").build().unwrap();
        let parsed = Uri::try_from("https://github.com/lycheeverse/lychee").unwrap();
        let github_uri = GithubUri::try_from(parsed).unwrap();

        let checker = get_checker(client);
        let status = checker.check_github(github_uri).await;

        // The authentication token is invalid, so the request is expected to
        // fail. Getting this far still proves that a client can be built and
        // a request can be performed.
        assert!(status.is_error());
    }
}