Skip to main content

lychee_lib/extract/html/
srcset.rs

1//! Extract all image URLs from a srcset.
2//!
3//! A `srcset` is a string containing a comma-separated list of one or more
4//! image candidate strings to be used when determining which image resource to
5//! present inside an `<img>` element.
6//!
7//! Each image candidate string must begin with a valid URL referencing a
8//! non-interactive graphic resource. This is followed by whitespace and then a
9//! condition descriptor that indicates the circumstances in which the indicated
10//! image should be used. Space characters, other than the whitespace separating
11//! the URL and the corresponding condition descriptor, are ignored; this
12//! includes both leading and trailing space, as well as space before or after
13//! each comma.
14//!
15//! Note: this handles cases where a URL contains a comma, which should be
16//! escaped, but is a valid character in a URL and occurs in the wild.
17//! Note: we cannot assume that commas within URLs are encoded as `%2C`, as they
18//! should be according to RFC 3986.
19//! Thus, the parsing process becomes significantly more complex and we need to
20//! use a state machine to keep track of the current state.
21
22use log::info;
23use std::result::Result;
24
/// Tokenizer state used by `skip_descriptor` while it scans past the
/// descriptor part of an image candidate string.
///
/// The variants mirror the "in descriptor", "after descriptor" and "in parens"
/// states of the WHATWG srcset parsing algorithm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Scanning the characters of a descriptor token.
    InsideDescriptor,
    /// Scanning whitespace that follows a descriptor token.
    AfterDescriptor,
    /// Scanning a parenthesized group; it ends at the next `)`.
    InsideParens,
}
30
/// Splits `input` in two at the first character that fails `predicate`.
///
/// The first element of the returned pair is the longest prefix whose
/// characters all satisfy `predicate`; the second element is everything that
/// follows it. When no character is rejected (which includes the empty
/// input), the prefix is the whole input and the rest is the empty string.
fn split_at<F>(input: &str, predicate: F) -> (&str, &str)
where
    F: Fn(&char) -> bool,
{
    // `str::find` reports the byte offset of the first rejected character,
    // which is always a valid boundary for `str::split_at`.
    match input.find(|ch: char| !predicate(&ch)) {
        Some(offset) => input.split_at(offset),
        None => (input, ""),
    }
}
47
48/// Parse a srcset string into a list of URLs.
49//
50// This state-machine is a bit convoluted, but we keep everything in one place
51// for simplicity so we have to please clippy.
52pub(crate) fn parse(input: &str) -> Vec<&str> {
53    let mut candidates: Vec<&str> = Vec::new();
54    let mut remaining = input;
55    while !remaining.is_empty() {
56        remaining = match parse_one_url(remaining) {
57            Ok((rem, None)) => rem,
58            Ok((rem, Some(url))) => {
59                candidates.push(url);
60                rem
61            }
62            Err(e) => {
63                info!("{e}");
64                return vec![];
65            }
66        }
67    }
68
69    candidates
70}
71
72/// Implements one iteration of the "splitting loop" from the reference algorithm.
73/// This is intended to be repeatedly called until the remaining string is empty.
74///
75/// Returns a tuple of remaining string and an optional parsed URL, if successful.
76/// Otherwise, in case of srcset syntax errors, returns Err.
77///
78/// <https://html.spec.whatwg.org/multipage/images.html#parsing-a-srcset-attribute>
79fn parse_one_url(remaining: &str) -> Result<(&str, Option<&str>), String> {
80    let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_ascii_whitespace());
81
82    if start.find(',').is_some() {
83        return Err("srcset parse error (too many commas)".to_string());
84    }
85
86    if remaining.is_empty() {
87        return Ok(("", None));
88    }
89
90    let (url, remaining) = split_at(remaining, |c| !c.is_ascii_whitespace());
91
92    let comma_count = url.chars().rev().take_while(|c| *c == ',').count();
93    if comma_count > 1 {
94        return Err("srcset parse error (trailing commas)".to_string());
95    }
96
97    let url = url.get(..url.len() - comma_count);
98
99    let (_spaces, remaining) = split_at(remaining, char::is_ascii_whitespace);
100
101    let remaining = skip_descriptor(remaining);
102
103    Ok((remaining, url))
104}
105
106/// Helper function to skip over a descriptor. Returns the string remaining
107/// after the descriptor (i.e. a string beginning after the next comma or an
108/// empty string).
109#[allow(clippy::single_match)]
110fn skip_descriptor(remaining: &str) -> &str {
111    let mut state = State::InsideDescriptor;
112
113    for (i, c) in remaining.char_indices() {
114        match state {
115            State::InsideDescriptor => match c {
116                c if c.is_ascii_whitespace() => state = State::AfterDescriptor,
117                '(' => state = State::InsideParens,
118                ',' => return &remaining[i + c.len_utf8()..], // returns string after this comma
119                _ => (),
120            },
121            State::InsideParens => match c {
122                ')' => state = State::InsideDescriptor,
123                _ => (),
124            },
125            State::AfterDescriptor => match c {
126                c if c.is_ascii_whitespace() => (),
127                _ => state = State::InsideDescriptor,
128            },
129        }
130    }
131
132    ""
133}
134
#[cfg(test)]
mod tests {
    use super::*;

    // --- split_at -----------------------------------------------------------

    #[test]
    fn test_split_at_with_empty_string() {
        let (sequence, remainder) = split_at("", |c| c.is_alphabetic());
        assert_eq!(sequence, "");
        assert_eq!(remainder, "");
    }

    #[test]
    fn test_split_at_with_alphabetic_predicate() {
        let (sequence, remainder) = split_at("abc123", |c| c.is_alphabetic());
        assert_eq!(sequence, "abc");
        assert_eq!(remainder, "123");
    }

    #[test]
    fn test_split_at_with_digit_predicate() {
        let (sequence, remainder) = split_at("123abc", char::is_ascii_digit);
        assert_eq!(sequence, "123");
        assert_eq!(remainder, "abc");
    }

    #[test]
    fn test_split_at_with_no_match() {
        let (sequence, remainder) = split_at("123abc", |c| c.is_whitespace());
        assert_eq!(sequence, "");
        assert_eq!(remainder, "123abc");
    }

    #[test]
    fn test_split_at_with_all_match() {
        let (sequence, remainder) = split_at("123abc", |c| !c.is_whitespace());
        assert_eq!(sequence, "123abc");
        assert_eq!(remainder, "");
    }

    #[test]
    fn test_split_at_with_multibyte_characters() {
        // The split offset must land on a character boundary.
        let (sequence, remainder) = split_at("héllo wörld", |c| !c.is_whitespace());
        assert_eq!(sequence, "héllo");
        assert_eq!(remainder, " wörld");
    }

    // --- parse --------------------------------------------------------------

    #[test]
    fn test_parse_no_value() {
        assert!(parse("").is_empty());
    }

    #[test]
    fn test_parse_whitespace_only() {
        assert!(parse("  \t\n ").is_empty());
    }

    #[test]
    fn test_parse_url_one_value() {
        assert_eq!(parse("test-img-320w.jpg 320w"), vec!["test-img-320w.jpg"]);
    }

    #[test]
    fn test_parse_url_without_descriptor() {
        assert_eq!(parse("image.png"), vec!["image.png"]);
    }

    #[test]
    fn test_parse_srcset_two_values() {
        assert_eq!(
            parse("test-img-320w.jpg 320w, test-img-480w.jpg 480w"),
            vec!["test-img-320w.jpg", "test-img-480w.jpg"]
        );
    }

    #[test]
    fn test_parse_srcset_with_unencoded_comma() {
        assert_eq!(
            parse(
                "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 640w, /cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 750w"
            ),
            vec![
                "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg",
                "/cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg"
            ]
        );
    }

    #[test]
    fn test_parse_srcset_url() {
        assert_eq!(
            parse("https://example.com/image1.jpg 1x, https://example.com/image2.jpg 2x"),
            vec![
                "https://example.com/image1.jpg",
                "https://example.com/image2.jpg"
            ]
        );
    }

    #[test]
    fn test_parse_srcset_without_spaces() {
        assert_eq!(
            parse(
                "/300.png 300w,/600.png 600w,/900.png 900w,https://x.invalid/a.png 1000w,relative.png 10w"
            ),
            vec![
                "/300.png",
                "/600.png",
                "/900.png",
                "https://x.invalid/a.png",
                "relative.png"
            ]
        );
    }

    #[test]
    fn test_parse_descriptor_with_parenthesized_comma() {
        // A comma inside parentheses belongs to the descriptor.
        assert_eq!(
            parse("a.png foo(1,2), b.png 2x"),
            vec!["a.png", "b.png"]
        );
    }

    #[test]
    fn test_parse_leading_comma_is_rejected() {
        assert!(parse(", a.png 1x").is_empty());
    }

    #[test]
    fn test_parse_repeated_trailing_commas_are_rejected() {
        assert!(parse("a.png,, b.png 2x").is_empty());
    }
}