lychee_lib/archive/wayback/
mod.rs1use std::sync::LazyLock;
2use std::time::Duration;
3
4use serde::de::Error as SerdeError;
5use serde::{Deserialize, Deserializer};
6
7use http::StatusCode;
8use reqwest::{Client, Error, Url};
9
10static WAYBACK_URL: LazyLock<Url> =
11 LazyLock::new(|| Url::parse("https://archive.org/wayback/available").unwrap());
12
13pub(crate) async fn get_archive_snapshot(
14 url: &Url,
15 timeout: Duration,
16) -> Result<Option<Url>, Error> {
17 get_archive_snapshot_internal(url, timeout, WAYBACK_URL.clone()).await
18}
19
20async fn get_archive_snapshot_internal(
21 url: &Url,
22 timeout: Duration,
23 mut api: Url,
24) -> Result<Option<Url>, Error> {
25 let url = url.to_string();
26
27 let stripped = url.strip_suffix("/").unwrap_or(&url);
29 api.set_query(Some(&format!("url={stripped}")));
30
31 let response = Client::builder()
32 .timeout(timeout)
33 .build()?
34 .get(api)
35 .send()
36 .await?
37 .json::<InternetArchiveResponse>()
38 .await?;
39
40 Ok(response
41 .archived_snapshots
42 .closest
43 .map(|closest| closest.url))
44}
45
46#[derive(Debug, Deserialize, Eq, PartialEq)]
47pub(crate) struct InternetArchiveResponse {
48 pub(crate) url: Url,
49 pub(crate) archived_snapshots: ArchivedSnapshots,
50}
51
52#[derive(Debug, Deserialize, Eq, PartialEq)]
53pub(crate) struct ArchivedSnapshots {
54 pub(crate) closest: Option<Closest>,
55}
56
57#[derive(Debug, Deserialize, Eq, PartialEq)]
58pub(crate) struct Closest {
59 #[serde(deserialize_with = "from_string")]
60 pub(crate) status: StatusCode,
61 pub(crate) available: bool,
62 pub(crate) url: Url,
63 pub(crate) timestamp: String,
64}
65
66fn from_string<'d, D>(deserializer: D) -> Result<StatusCode, D::Error>
67where
68 D: Deserializer<'d>,
69{
70 let value: &str = Deserialize::deserialize(deserializer)?;
71 let result = value
72 .parse::<u16>()
73 .map_err(|e| D::Error::custom(e.to_string()))?;
74 StatusCode::from_u16(result).map_err(|e| D::Error::custom(e.to_string()))
75}
76
#[cfg(test)]
mod tests {
    use super::{get_archive_snapshot, get_archive_snapshot_internal};
    use http::StatusCode;
    use reqwest::{Client, Error, Url};
    use std::{error::Error as StdError, time::Duration};
    use wiremock::matchers::{method, query_param};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    /// Timeout shared by every HTTP request issued from these tests.
    const TIMEOUT: Duration = Duration::from_secs(20);

    /// Runs the lookup against a local mock server that answers with a canned
    /// response and checks that the snapshot URL is extracted from it.
    #[tokio::test]
    async fn wayback_suggestion_mocked() -> Result<(), Box<dyn StdError>> {
        let server = MockServer::start().await;
        let endpoint = server.uri();

        let canned = ResponseTemplate::new(StatusCode::OK).set_body_raw(
            r#"
            {
                "url": "https://google.com/jobs.html",
                "archived_snapshots": {
                    "closest": {
                        "available": true,
                        "url": "http://web.archive.org/web/20130919044612/http://example.com/",
                        "timestamp": "20130919044612",
                        "status": "200"
                    }
                }
            }
            "#,
            "application/json",
        );

        let url_to_restore = "https://example.com".parse::<Url>()?;

        // The mock only answers when the `url` parameter equals the target
        // with its trailing slash removed.
        let expected_param = url_to_restore.as_str().strip_suffix("/").unwrap();
        Mock::given(method("GET"))
            .and(query_param("url", expected_param))
            .respond_with(canned)
            .mount(&server)
            .await;

        let snapshot =
            get_archive_snapshot_internal(&url_to_restore, TIMEOUT, endpoint.parse()?).await?;

        let expected: Url =
            "http://web.archive.org/web/20130919044612/http://example.com/".parse()?;
        assert_eq!(snapshot, Some(expected));

        Ok(())
    }

    /// Fetches the public API documentation page and checks that it still
    /// contains the known "Updated on" line.
    // NOTE(review): presumably a changed line signals a possible API change
    // that the mocked response above should be re-validated against.
    #[tokio::test]
    async fn wayback_api_no_breaking_changes() -> Result<(), Error> {
        let api_docs_url = "https://archive.org/help/wayback_api.php";

        let client = Client::builder().timeout(TIMEOUT).build()?;
        let page = client.get(api_docs_url).send().await?;
        let html = page.text().await?;

        assert!(html.contains("Updated on September, 24, 2013"));
        Ok(())
    }

    #[ignore = "
        It is flaky because the API does not reliably return snapshots,
        i.e. the `archived_snapshots` field is unreliable.
        That's why the test is ignored. For development and documentation this test is still useful."]
    #[tokio::test]
    async fn wayback_suggestion_real() -> Result<(), Box<dyn StdError>> {
        let target: Url = "https://example.com".parse()?;
        let snapshot = get_archive_snapshot(&target, TIMEOUT).await?;

        let expected: Url =
            "http://web.archive.org/web/20250603204626/http://www.example.com/".parse()?;
        assert_eq!(snapshot, Some(expected));
        Ok(())
    }

    /// Queries the real endpoint for a URL that is expected to have no
    /// snapshot and checks that the lookup yields `None`.
    #[tokio::test]
    async fn wayback_suggestion_real_unknown() -> Result<(), Box<dyn StdError>> {
        let target: Url = "https://github.com/mre/idiomatic-rust-doesnt-exist-man".parse()?;
        let snapshot = get_archive_snapshot(&target, TIMEOUT).await?;

        assert_eq!(snapshot, None);
        Ok(())
    }
}