Skip to main content

lychee_lib/quirks/
mod.rs

1use crate::{
2    Status,
3    chain::{ChainResult, Handler},
4};
5use async_trait::async_trait;
6use header::HeaderValue;
7use http::header;
8use regex::{Captures, Regex};
9use reqwest::{Request, Url};
10use std::{collections::HashMap, sync::LazyLock};
11
12static CRATES_PATTERN: LazyLock<Regex> =
13    LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?crates.io").unwrap());
14static YOUTUBE_PATTERN: LazyLock<Regex> =
15    LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?youtube(-nocookie)?\.com").unwrap());
16static YOUTUBE_SHORT_PATTERN: LazyLock<Regex> =
17    LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?(youtu\.?be)").unwrap());
18static GITHUB_BLOB_MARKDOWN_FRAGMENT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
19    Regex::new(r"^https://github\.com/(?<user>.*?)/(?<repo>.*?)/blob/(?<path>.*?)/(?<file>.*\.(md|markdown)#.*)$")
20        .unwrap()
21});
22
23// Retrieve a map of query params for the given request
24fn query(request: &Request) -> HashMap<String, String> {
25    request.url().query_pairs().into_owned().collect()
26}
27
28#[derive(Debug, Clone)]
29pub(crate) struct Quirk {
30    pub(crate) pattern: &'static LazyLock<Regex>,
31    pub(crate) rewrite: fn(Request, Captures) -> Request,
32}
33
34#[derive(Debug, Clone)]
35pub(crate) struct Quirks {
36    quirks: Vec<Quirk>,
37}
38
39impl Default for Quirks {
40    fn default() -> Self {
41        let quirks = vec![
42            Quirk {
43                pattern: &CRATES_PATTERN,
44                rewrite: |mut request, _| {
45                    request
46                        .headers_mut()
47                        .insert(header::ACCEPT, HeaderValue::from_static("text/html"));
48                    request
49                },
50            },
51            Quirk {
52                pattern: &YOUTUBE_PATTERN,
53                rewrite: |mut request, _| {
54                    // Extract video id if it's a video page
55                    let video_id = match request.url().path() {
56                        "/watch" => query(&request).get("v").map(ToOwned::to_owned),
57                        path if path.starts_with("/embed/") => {
58                            path.strip_prefix("/embed/").map(ToOwned::to_owned)
59                        }
60                        _ => return request,
61                    };
62
63                    // Only rewrite to thumbnail if we got a video id
64                    if let Some(id) = video_id {
65                        *request.url_mut() =
66                            Url::parse(&format!("https://img.youtube.com/vi/{id}/0.jpg")).unwrap();
67                    }
68
69                    request
70                },
71            },
72            Quirk {
73                pattern: &YOUTUBE_SHORT_PATTERN,
74                rewrite: |mut request, _| {
75                    // Short links use the path as video id
76                    let id = request.url().path().trim_start_matches('/');
77                    if id.is_empty() {
78                        return request;
79                    }
80                    *request.url_mut() =
81                        Url::parse(&format!("https://img.youtube.com/vi/{id}/0.jpg")).unwrap();
82                    request
83                },
84            },
85            Quirk {
86                pattern: &GITHUB_BLOB_MARKDOWN_FRAGMENT_PATTERN,
87                rewrite: |mut request, captures| {
88                    let mut raw_url = String::new();
89                    captures.expand(
90                        "https://raw.githubusercontent.com/$user/$repo/$path/$file",
91                        &mut raw_url,
92                    );
93                    *request.url_mut() = Url::parse(&raw_url).unwrap();
94                    request
95                },
96            },
97        ];
98        Self { quirks }
99    }
100}
101
102impl Quirks {
103    /// Apply quirks to a given request. Only the first quirk regex pattern
104    /// matching the URL will be applied. The rest will be discarded for
105    /// simplicity reasons. This limitation might be lifted in the future.
106    pub(crate) fn apply(&self, request: Request) -> Request {
107        for quirk in &self.quirks {
108            if let Some(captures) = quirk.pattern.captures(request.url().clone().as_str()) {
109                return (quirk.rewrite)(request, captures);
110            }
111        }
112        // Request was not modified
113        request
114    }
115}
116
117#[async_trait]
118impl Handler<Request, Status> for Quirks {
119    async fn handle(&mut self, input: Request) -> ChainResult<Request, Status> {
120        ChainResult::Next(self.apply(input))
121    }
122}
123
#[cfg(test)]
mod tests {
    use header::HeaderValue;
    use http::{Method, header};
    use reqwest::{Request, Url};

    use super::Quirks;

    /// Wrapper that makes requests comparable in assertions. Two mock
    /// requests are equal when their URL and method are equal.
    #[derive(Debug)]
    struct MockRequest(Request);

    impl MockRequest {
        fn new(method: Method, url: Url) -> Self {
            Self(Request::new(method, url))
        }
    }

    impl PartialEq for MockRequest {
        fn eq(&self, other: &Self) -> bool {
            self.0.url() == other.0.url() && self.0.method() == other.0.method()
        }
    }

    /// Sends a `GET` request for `url` through the default quirks.
    fn apply_default_quirks(url: &str) -> Request {
        let request = Request::new(Method::GET, Url::parse(url).unwrap());
        Quirks::default().apply(request)
    }

    /// Asserts that a `GET` request for `origin` ends up as a `GET` request
    /// for `expected` after the default quirks were applied.
    fn assert_rewritten(origin: &str, expected: &str) {
        let modified = apply_default_quirks(origin);
        let expected_url = Url::parse(expected).unwrap();

        assert_eq!(
            MockRequest(modified),
            MockRequest::new(Method::GET, expected_url)
        );
    }

    #[test]
    fn test_cratesio_request() {
        let modified = apply_default_quirks("https://crates.io/crates/lychee");
        let accept = modified.headers().get(header::ACCEPT).unwrap();

        assert_eq!(accept, HeaderValue::from_static("text/html"));
    }

    #[test]
    fn test_youtube_video_request() {
        assert_rewritten(
            "https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7",
            "https://img.youtube.com/vi/NlKuICiT470/0.jpg",
        );
    }

    #[test]
    fn test_youtube_video_nocookie_request() {
        assert_rewritten(
            "https://www.youtube-nocookie.com/embed/BIguvia6AvM",
            "https://img.youtube.com/vi/BIguvia6AvM/0.jpg",
        );
    }

    #[test]
    fn test_youtube_video_shortlink_request() {
        assert_rewritten(
            "https://youtu.be/Rvu7N4wyFpk?t=42",
            "https://img.youtube.com/vi/Rvu7N4wyFpk/0.jpg",
        );
    }

    #[test]
    fn test_non_video_youtube_url_untouched() {
        let url = "https://www.youtube.com/channel/UCaYhcUwRBNscFNUKTjgPFiA";
        assert_rewritten(url, url);
    }

    #[test]
    fn test_github_blob_markdown_fragment_request() {
        // (original URL, expected URL after applying the quirks)
        let cases = [
            // Markdown file with fragment: rewritten to the raw file.
            (
                "https://github.com/moby/docker-image-spec/blob/main/spec.md#terminology",
                "https://raw.githubusercontent.com/moby/docker-image-spec/main/spec.md#terminology",
            ),
            (
                "https://github.com/moby/docker-image-spec/blob/main/spec.markdown#terminology",
                "https://raw.githubusercontent.com/moby/docker-image-spec/main/spec.markdown#terminology",
            ),
            // Markdown file without fragment: unchanged.
            (
                "https://github.com/moby/docker-image-spec/blob/main/spec.md",
                "https://github.com/moby/docker-image-spec/blob/main/spec.md",
            ),
            // Non-markdown file with fragment: unchanged.
            (
                "https://github.com/lycheeverse/lychee/blob/master/.gitignore#section",
                "https://github.com/lycheeverse/lychee/blob/master/.gitignore#section",
            ),
            // Tag instead of branch name.
            (
                "https://github.com/lycheeverse/lychee/blob/v0.15.0/README.md#features",
                "https://raw.githubusercontent.com/lycheeverse/lychee/v0.15.0/README.md#features",
            ),
        ];

        for &(origin, expected) in &cases {
            assert_rewritten(origin, expected);
        }
    }

    #[test]
    fn test_no_quirk_applied() {
        let url = "https://endler.dev";
        assert_rewritten(url, url);
    }
}