Skip to main content

lychee_lib/archive/wayback/
mod.rs

1use std::sync::LazyLock;
2use std::time::Duration;
3
4use serde::de::Error as SerdeError;
5use serde::{Deserialize, Deserializer};
6
7use http::StatusCode;
8use reqwest::{Client, Error, Url};
9
10static WAYBACK_URL: LazyLock<Url> =
11    LazyLock::new(|| Url::parse("https://archive.org/wayback/available").unwrap());
12
13pub(crate) async fn get_archive_snapshot(
14    url: &Url,
15    timeout: Duration,
16) -> Result<Option<Url>, Error> {
17    get_archive_snapshot_internal(url, timeout, WAYBACK_URL.clone()).await
18}
19
20async fn get_archive_snapshot_internal(
21    url: &Url,
22    timeout: Duration,
23    mut api: Url,
24) -> Result<Option<Url>, Error> {
25    let url = url.to_string();
26
27    // The Wayback API doesn't return any snapshots for URLs with trailing slashes
28    let stripped = url.strip_suffix("/").unwrap_or(&url);
29    api.set_query(Some(&format!("url={stripped}")));
30
31    let response = Client::builder()
32        .timeout(timeout)
33        .build()?
34        .get(api)
35        .send()
36        .await?
37        .json::<InternetArchiveResponse>()
38        .await?;
39
40    Ok(response
41        .archived_snapshots
42        .closest
43        .map(|closest| closest.url))
44}
45
46#[derive(Debug, Deserialize, Eq, PartialEq)]
47pub(crate) struct InternetArchiveResponse {
48    pub(crate) url: Url,
49    pub(crate) archived_snapshots: ArchivedSnapshots,
50}
51
52#[derive(Debug, Deserialize, Eq, PartialEq)]
53pub(crate) struct ArchivedSnapshots {
54    pub(crate) closest: Option<Closest>,
55}
56
57#[derive(Debug, Deserialize, Eq, PartialEq)]
58pub(crate) struct Closest {
59    #[serde(deserialize_with = "from_string")]
60    pub(crate) status: StatusCode,
61    pub(crate) available: bool,
62    pub(crate) url: Url,
63    pub(crate) timestamp: String,
64}
65
66fn from_string<'d, D>(deserializer: D) -> Result<StatusCode, D::Error>
67where
68    D: Deserializer<'d>,
69{
70    let value: &str = Deserialize::deserialize(deserializer)?;
71    let result = value
72        .parse::<u16>()
73        .map_err(|e| D::Error::custom(e.to_string()))?;
74    StatusCode::from_u16(result).map_err(|e| D::Error::custom(e.to_string()))
75}
76
#[cfg(test)]
mod tests {
    use super::{get_archive_snapshot, get_archive_snapshot_internal};
    use http::StatusCode;
    use reqwest::{Client, Error, Url};
    use std::{error::Error as StdError, time::Duration};
    use wiremock::matchers::query_param;

    /// Timeout handed to every HTTP request issued by these tests.
    const TIMEOUT: Duration = Duration::from_secs(20);

    /// Test retrieval by mocking the Wayback API.
    /// We mock their API because unfortunately it happens quite often that the
    /// `archived_snapshots` field is empty because the API is unreliable.
    /// This way we avoid flaky tests.
    #[tokio::test]
    async fn wayback_suggestion_mocked() -> Result<(), Box<dyn StdError>> {
        let server = wiremock::MockServer::start().await;
        let canned_response = wiremock::ResponseTemplate::new(StatusCode::OK).set_body_raw(
            r#"
                {
                    "url": "https://google.com/jobs.html",
                    "archived_snapshots": {
                        "closest": {
                            "available": true,
                            "url": "http://web.archive.org/web/20130919044612/http://example.com/",
                            "timestamp": "20130919044612",
                            "status": "200"
                        }
                    }
                }
                "#,
            "application/json",
        );

        let target: Url = "https://example.com".parse()?;
        // The lookup strips a trailing slash before querying, so the mock only answers
        // requests for the stripped form. The `unwrap` relies on the parsed URL
        // ending in `/`.
        let expected_query = target.as_str().strip_suffix("/").unwrap();
        wiremock::Mock::given(wiremock::matchers::method("GET"))
            .and(query_param("url", expected_query))
            .respond_with(canned_response)
            .mount(&server)
            .await;

        let endpoint: Url = server.uri().parse()?;
        let snapshot = get_archive_snapshot_internal(&target, TIMEOUT, endpoint).await?;

        let expected: Url =
            "http://web.archive.org/web/20130919044612/http://example.com/".parse()?;
        assert_eq!(snapshot, Some(expected));

        Ok(())
    }

    /// Their API documentation mentions when the last changes occurred.
    /// Because we mock their API in previous tests we try to detect breaking API changes with this test.
    #[tokio::test]
    async fn wayback_api_no_breaking_changes() -> Result<(), Error> {
        let client = Client::builder().timeout(TIMEOUT).build()?;
        let docs_page = client
            .get("https://archive.org/help/wayback_api.php")
            .send()
            .await?;
        let html = docs_page.text().await?;

        assert!(html.contains("Updated on September, 24, 2013"));
        Ok(())
    }

    /// This tests the real Wayback API without any mocks.
    #[ignore = "
        It is flaky because the API does not reliably return snapshots,
        i.e. the `archived_snapshots` field is unreliable.
        That's why the test is ignored. For development and documentation this test is still useful."]
    #[tokio::test]
    async fn wayback_suggestion_real() -> Result<(), Box<dyn StdError>> {
        let target: Url = "https://example.com".try_into()?;
        let snapshot = get_archive_snapshot(&target, TIMEOUT).await?;

        let expected: Url =
            "http://web.archive.org/web/20250603204626/http://www.example.com/".parse()?;
        assert_eq!(snapshot, Some(expected));
        Ok(())
    }

    /// This tests the real Wayback API without any mocks.
    /// The flakiness of the API shouldn't affect this test because it originates from
    /// the `archived_snapshots` field.
    #[tokio::test]
    async fn wayback_suggestion_real_unknown() -> Result<(), Box<dyn StdError>> {
        let target: Url = "https://github.com/mre/idiomatic-rust-doesnt-exist-man".try_into()?;
        let snapshot = get_archive_snapshot(&target, TIMEOUT).await?;
        assert_eq!(snapshot, None);
        Ok(())
    }
}
173        let response = get_archive_snapshot(url, TIMEOUT).await?;
174        assert_eq!(response, None);
175        Ok(())
176    }
177}