1use crate::{
2 BasicAuthCredentials, ErrorKind, FileType, Status, Uri,
3 chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain},
4 quirks::Quirks,
5 ratelimit::HostPool,
6 retry::RetryExt,
7 types::{redirect_history::RedirectHistory, uri::github::GithubUri},
8 utils::fragment_checker::{FragmentChecker, FragmentInput},
9};
10use async_trait::async_trait;
11use http::{Method, StatusCode};
12use octocrab::Octocrab;
13use reqwest::{Request, header::CONTENT_TYPE};
14use std::{borrow::Cow, collections::HashSet, path::Path, sync::Arc, time::Duration};
15use url::Url;
16
/// Checker for website URIs.
///
/// Builds requests through the host pool, runs them through the plugin and
/// default request chains with retries, and post-processes the resulting
/// status (HTTPS enforcement, GitHub API fallback, redirect history).
#[derive(Debug, Clone)]
pub(crate) struct WebsiteChecker {
    /// HTTP method used when building the request for a URI.
    method: reqwest::Method,

    /// Optional GitHub API client used as a fallback when a regular check of
    /// a GitHub URI fails. Without it the fallback reports a missing token.
    github_client: Option<Octocrab>,

    /// User-supplied request chain that is traversed together with (and listed
    /// before) the default chain.
    plugin_request_chain: RequestChain,

    /// Upper bound on how many times a failed request is re-sent.
    max_retries: u64,

    /// Wait time before the first retry; it is doubled after every retry.
    retry_wait_time: Duration,

    /// Status codes handed to `Status::new` when a response is classified.
    accepted: HashSet<StatusCode>,

    /// When `true`, a successful `http` link is reported as insecure if the
    /// same link is also reachable over `https`.
    require_https: bool,

    /// When `true`, URL fragments of `GET` requests are verified against the
    /// response body.
    include_fragments: bool,

    /// Performs the actual fragment lookup in a response body.
    fragment_checker: FragmentChecker,

    /// Applied to the final status of every check via `handle_redirected`.
    redirect_history: RedirectHistory,

    /// Shared pool used to build and execute requests.
    host_pool: Arc<HostPool>,
}
62
63impl WebsiteChecker {
64 #[must_use]
66 pub(crate) fn host_pool(&self) -> Arc<HostPool> {
67 self.host_pool.clone()
68 }
69
70 #[allow(clippy::too_many_arguments)]
71 pub(crate) fn new(
72 method: reqwest::Method,
73 retry_wait_time: Duration,
74 redirect_history: RedirectHistory,
75 max_retries: u64,
76 accepted: HashSet<StatusCode>,
77 github_client: Option<Octocrab>,
78 require_https: bool,
79 plugin_request_chain: RequestChain,
80 include_fragments: bool,
81 host_pool: Arc<HostPool>,
82 ) -> Self {
83 Self {
84 method,
85 github_client,
86 plugin_request_chain,
87 redirect_history,
88 max_retries,
89 retry_wait_time,
90 accepted,
91 require_https,
92 include_fragments,
93 fragment_checker: FragmentChecker::new(),
94 host_pool,
95 }
96 }
97
98 pub(crate) async fn retry_request(&self, request: Request) -> Status {
103 let mut retries: u64 = 0;
104 let mut wait_time = self.retry_wait_time;
105 let mut status = self.check_default(clone_unwrap(&request)).await;
106 while retries < self.max_retries {
107 if status.is_success() || !status.should_retry() {
108 return status;
109 }
110 retries += 1;
111 tokio::time::sleep(wait_time).await;
112 wait_time = wait_time.saturating_mul(2);
113 status = self.check_default(clone_unwrap(&request)).await;
114 }
115
116 status
117 }
118
119 async fn check_default(&self, request: Request) -> Status {
121 let method = request.method().clone();
122 let request_url = request.url().clone();
123
124 let check_request_fragments = self.include_fragments
125 && method == Method::GET
126 && request_url.fragment().is_some_and(|x| !x.is_empty());
127
128 match self
129 .host_pool
130 .execute_request(request, check_request_fragments)
131 .await
132 {
133 Ok(response) => {
134 let status = Status::new(&response, &self.accepted);
135 if let Some(content) = response.text
138 && check_request_fragments
139 && response.status.is_success()
140 {
141 let Some(content_type) = response
142 .headers
143 .get(CONTENT_TYPE)
144 .and_then(|header| header.to_str().ok())
145 else {
146 return status;
147 };
148
149 let file_type = match content_type {
150 ct if ct.starts_with("text/html") => FileType::Html,
151 ct if ct.starts_with("text/markdown") => FileType::Markdown,
152 ct if ct.starts_with("text/plain") => {
153 let path = Path::new(response.url.path());
154 match path.extension() {
155 Some(ext) if ext.eq_ignore_ascii_case("md") => FileType::Markdown,
156 _ => return status,
157 }
158 }
159 _ => return status,
160 };
161
162 self.check_html_fragment(request_url, status, &content, file_type)
163 .await
164 } else {
165 status
166 }
167 }
168 Err(e) => e.into(),
169 }
170 }
171
172 async fn check_html_fragment(
173 &self,
174 url: Url,
175 status: Status,
176 content: &str,
177 file_type: FileType,
178 ) -> Status {
179 match self
180 .fragment_checker
181 .check(
182 FragmentInput {
183 content: Cow::Borrowed(content),
184 file_type,
185 },
186 &url,
187 )
188 .await
189 {
190 Ok(true) => status,
191 Ok(false) => Status::Error(ErrorKind::InvalidFragment(url.into())),
192 Err(e) => Status::Error(e),
193 }
194 }
195
196 pub(crate) async fn check_website(
206 &self,
207 uri: &Uri,
208 credentials: Option<BasicAuthCredentials>,
209 ) -> Result<Status, ErrorKind> {
210 let default_chain: RequestChain = Chain::new(vec![
211 Box::<Quirks>::default(),
212 Box::new(credentials),
213 Box::new(self.clone()),
214 ]);
215
216 let status = self.check_website_inner(uri, &default_chain).await;
217 let status = self
218 .handle_insecure_url(uri, &default_chain, status)
219 .await?;
220 Ok(self.redirect_history.handle_redirected(&uri.url, status))
221 }
222
223 async fn handle_insecure_url(
226 &self,
227 uri: &Uri,
228 default_chain: &Chain<Request, Status>,
229 status: Status,
230 ) -> Result<Status, ErrorKind> {
231 if self.require_https
232 && uri.scheme() == "http"
233 && let Status::Ok(_) = status
234 {
235 let https_uri = uri.to_https()?;
236 let is_https_available = self
237 .check_website_inner(&https_uri, default_chain)
238 .await
239 .is_success();
240
241 if is_https_available {
242 return Ok(Status::Error(ErrorKind::InsecureURL(https_uri)));
243 }
244 }
245
246 Ok(status)
247 }
248
249 async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
262 let request = self.host_pool.build_request(self.method.clone(), uri);
263
264 let request = match request {
265 Ok(r) => r,
266 Err(e) => return e.into(),
267 };
268
269 let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
270 .traverse(request)
271 .await;
272
273 self.handle_github(status, uri).await
274 }
275
276 async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
280 if status.is_success() {
281 return status;
282 }
283
284 if let Ok(github_uri) = GithubUri::try_from(uri) {
285 let status = self.check_github(github_uri).await;
286 if status.is_success() {
287 return status;
288 }
289 }
290
291 status
292 }
293
294 async fn check_github(&self, uri: GithubUri) -> Status {
305 let Some(client) = &self.github_client else {
306 return ErrorKind::MissingGitHubToken.into();
307 };
308 let repo = match client.repos(&uri.owner, &uri.repo).get().await {
309 Ok(repo) => repo,
310 Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
311 };
312 if let Some(true) = repo.private {
313 return Status::Ok(StatusCode::OK);
314 } else if let Some(endpoint) = uri.endpoint {
315 return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
316 .into();
317 }
318 Status::Ok(StatusCode::OK)
319 }
320}
321
322fn clone_unwrap(request: &Request) -> Request {
332 request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled")
333}
334
335#[async_trait]
336impl Handler<Request, Status> for WebsiteChecker {
337 async fn handle(&mut self, input: Request) -> ChainResult<Request, Status> {
338 ChainResult::Done(self.retry_request(input).await)
339 }
340}
341
342#[cfg(test)]
343mod tests {
344 use std::{sync::Arc, time::Duration};
345
346 use http::Method;
347 use octocrab::Octocrab;
348
349 use crate::{
350 Uri,
351 chain::RequestChain,
352 checker::website::WebsiteChecker,
353 ratelimit::HostPool,
354 types::{
355 DEFAULT_ACCEPTED_STATUS_CODES, redirect_history::RedirectHistory,
356 uri::github::GithubUri,
357 },
358 };
359
360 #[tokio::test]
363 async fn test_github_client_integration() {
364 let client = Octocrab::builder().personal_token("dummy").build().unwrap();
365 let uri =
366 GithubUri::try_from(Uri::try_from("https://github.com/lycheeverse/lychee").unwrap())
367 .unwrap();
368
369 let status = get_checker(client).check_github(uri).await;
370
371 assert!(status.is_error());
374 }
375
376 fn get_checker(client: Octocrab) -> WebsiteChecker {
377 let host_pool = HostPool::default();
378 WebsiteChecker::new(
379 Method::GET,
380 Duration::ZERO,
381 RedirectHistory::new(),
382 0,
383 DEFAULT_ACCEPTED_STATUS_CODES.clone(),
384 Some(client),
385 false,
386 RequestChain::default(),
387 false,
388 Arc::new(host_pool),
389 )
390 }
391}