cargo/util/network/retry.rs

//! Utilities for retrying a network operation.
//!
//! Some network errors are considered "spurious", meaning they are not real
//! errors (such as a 404 not found) and are likely transient errors (like a
//! bad network connection) that we can hope will resolve themselves shortly.
//! The [`Retry`] type offers a way to repeatedly perform some kind of network
//! operation with a delay if it detects one of these possibly transient
//! errors.
//!
//! This supports errors from [`git2`], [`gix`], [`curl`], and
//! [`HttpNotSuccessful`] (5xx and 429 HTTP errors).
//!
//! The number of retries can be configured by the user via the `net.retry`
//! config option. This indicates the number of times to retry the operation
//! (default 3 times for a total of 4 attempts).
//!
//! There are hard-coded constants that indicate how long to sleep between
//! retries. The constants are tuned to balance a few factors, such as the
//! responsiveness to the user (we don't want cargo to hang for too long
//! retrying things), and accommodating things like Cloudfront's default
//! negative TTL of 10 seconds (if Cloudfront gets a 5xx error for whatever
//! reason it won't try to fetch again for 10 seconds).
//!
//! The retry delay also implements a primitive form of random jitter, so
//! that if multiple requests fail at the same time they don't all flood the
//! server at the same moment when they are retried. This jitter still has
//! some clumping behavior, but should be good enough.
//!
//! [`Retry`] is the core type for implementing retry logic. The
//! [`Retry::try`] method can be called with a callback, and it will
//! indicate if it needs to be called again sometime in the future if there
//! was a possibly transient error. The caller is responsible for sleeping the
//! appropriate amount of time and then calling [`Retry::try`] again.
//!
//! [`with_retry`] is a convenience function that will create a [`Retry`] and
//! handle repeatedly running a callback until it succeeds, or it runs out of
//! retries.
//!
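//! As a rough sketch, manually driving [`Retry::try`] looks essentially like
//! what [`with_retry`] does internally (`do_network_call` here stands in for
//! a hypothetical network operation, and `gctx` is a `GlobalContext`):
//!
//! ```ignore
//! use cargo::util::network::retry::{Retry, RetryResult};
//! use std::time::Duration;
//!
//! let mut retry = Retry::new(gctx)?;
//! let value = loop {
//!     match retry.r#try(|| do_network_call()) {
//!         RetryResult::Success(v) => break v,
//!         RetryResult::Err(e) => return Err(e),
//!         // The caller is responsible for sleeping before the next attempt.
//!         RetryResult::Retry(sleep_ms) => std::thread::sleep(Duration::from_millis(sleep_ms)),
//!     }
//! };
//! ```
//!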
//! Some interesting resources about retries:
//! - <https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/>
//! - <https://en.wikipedia.org/wiki/Exponential_backoff>
//! - <https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After>

use crate::util::errors::HttpNotSuccessful;
use crate::{CargoResult, GlobalContext};
use anyhow::Error;
use rand::Rng;
use std::cmp::min;
use std::time::Duration;

/// State for managing retrying a network operation.
pub struct Retry<'a> {
    gctx: &'a GlobalContext,
    /// The number of failed attempts that have been done so far.
    ///
    /// Starts at 0, and increases by one each time an attempt fails.
    retries: u64,
    /// The maximum number of times the operation should be retried.
    ///
    /// 0 means it should never retry.
    max_retries: u64,
}

/// The result of attempting some operation via [`Retry::try`].
pub enum RetryResult<T> {
    /// The operation was successful.
    ///
    /// The wrapped value is the return value of the callback function.
    Success(T),
    /// The operation was an error, and it should not be tried again.
    Err(anyhow::Error),
    /// The operation failed, and should be tried again in the future.
    ///
    /// The wrapped value is the number of milliseconds to wait before trying
    /// again. The caller is responsible for waiting this long and then
    /// calling [`Retry::try`] again.
    Retry(u64),
}

/// Maximum amount of time a single retry can be delayed (milliseconds).
const MAX_RETRY_SLEEP_MS: u64 = 10 * 1000;
/// The minimum initial amount of time a retry will be delayed (milliseconds).
///
/// The actual amount of time will be a random value above this.
const INITIAL_RETRY_SLEEP_BASE_MS: u64 = 500;
/// The maximum amount of additional time the initial retry will take (milliseconds).
///
/// The initial delay will be [`INITIAL_RETRY_SLEEP_BASE_MS`] plus a random range
/// from 0 to this value.
const INITIAL_RETRY_JITTER_MS: u64 = 1000;

impl<'a> Retry<'a> {
    pub fn new(gctx: &'a GlobalContext) -> CargoResult<Retry<'a>> {
        Ok(Retry {
            gctx,
            retries: 0,
            max_retries: gctx.net_config()?.retry.unwrap_or(3) as u64,
        })
    }

    /// Calls the given callback, and returns a [`RetryResult`] which
    /// indicates whether or not this needs to be called again at some point
    /// in the future to retry the operation if it failed.
    pub fn r#try<T>(&mut self, f: impl FnOnce() -> CargoResult<T>) -> RetryResult<T> {
        match f() {
            Err(ref e) if maybe_spurious(e) && self.retries < self.max_retries => {
                let err = e.downcast_ref::<HttpNotSuccessful>();
                let err_msg = err
                    .map(|http_err| http_err.display_short())
                    .unwrap_or_else(|| e.root_cause().to_string());
                let left_retries = self.max_retries - self.retries;
                let msg = format!(
                    "spurious network error ({} {} remaining): {err_msg}",
                    left_retries,
                    if left_retries != 1 { "tries" } else { "try" }
                );
                if let Err(e) = self.gctx.shell().warn(msg) {
                    return RetryResult::Err(e);
                }
                self.retries += 1;
                let sleep = err
                    .and_then(|v| Self::parse_retry_after(v, &jiff::Timestamp::now()))
                    // Limit the Retry-After to a maximum value to avoid waiting too long.
                    .map(|retry_after| retry_after.min(MAX_RETRY_SLEEP_MS))
                    .unwrap_or_else(|| self.next_sleep_ms());
                RetryResult::Retry(sleep)
            }
            Err(e) => RetryResult::Err(e),
            Ok(r) => RetryResult::Success(r),
        }
    }

    /// Gets the next sleep duration in milliseconds.
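    ///
    /// With the default configuration, this yields a first delay of roughly
    /// 0.5 to 1.5 seconds (randomly jittered), then 3.5 seconds, 6.5 seconds,
    /// and so on, growing by 3 seconds per retry and capped at
    /// [`MAX_RETRY_SLEEP_MS`].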
    fn next_sleep_ms(&self) -> u64 {
        if let Ok(sleep) = self.gctx.get_env("__CARGO_TEST_FIXED_RETRY_SLEEP_MS") {
            return sleep.parse().expect("a u64");
        }

        if self.retries == 1 {
            let mut rng = rand::rng();
            INITIAL_RETRY_SLEEP_BASE_MS + rng.random_range(0..INITIAL_RETRY_JITTER_MS)
        } else {
            min(
                ((self.retries - 1) * 3) * 1000 + INITIAL_RETRY_SLEEP_BASE_MS,
                MAX_RETRY_SLEEP_MS,
            )
        }
    }

    /// Parse the HTTP `Retry-After` header.
    /// Returns the number of milliseconds to wait before retrying according to the header.
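    ///
    /// Both accepted forms of the header are handled, for example
    /// (illustrative values):
    ///
    /// ```text
    /// Retry-After: 120
    /// Retry-After: Wed, 21 Oct 2015 07:28:00 GMT
    /// ```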
    fn parse_retry_after(response: &HttpNotSuccessful, now: &jiff::Timestamp) -> Option<u64> {
        // Only applies to HTTP 429 (too many requests) and 503 (service unavailable).
        if !matches!(response.code, 429 | 503) {
            return None;
        }

        // Extract the Retry-After header value.
        let retry_after = response
            .headers
            .iter()
            .filter_map(|h| h.split_once(':'))
            .map(|(k, v)| (k.trim(), v.trim()))
            .find(|(k, _)| k.eq_ignore_ascii_case("retry-after"))?
            .1;

        // First option: Retry-After is a positive integer of seconds to wait.
        if let Ok(delay_secs) = retry_after.parse::<u32>() {
            return Some(delay_secs as u64 * 1000);
        }

        // Second option: Retry-After is a future HTTP date string that tells us when to retry.
        if let Ok(retry_time) = jiff::fmt::rfc2822::parse(retry_after) {
            let diff_ms = now
                .until(&retry_time)
                .unwrap()
                .total(jiff::Unit::Millisecond)
                .unwrap();
            if diff_ms > 0.0 {
                return Some(diff_ms as u64);
            }
        }
        None
    }
}

/// Whether the given error is a possibly transient ("spurious") network
/// error that is worth retrying.
fn maybe_spurious(err: &Error) -> bool {
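    // libgit2 errors in network-ish classes (net, OS, zlib, HTTP) are
    // retried, unless the failure was certificate validation.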
    if let Some(git_err) = err.downcast_ref::<git2::Error>() {
        match git_err.class() {
            git2::ErrorClass::Net
            | git2::ErrorClass::Os
            | git2::ErrorClass::Zlib
            | git2::ErrorClass::Http => return git_err.code() != git2::ErrorCode::Certificate,
            _ => (),
        }
    }
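    // curl errors that indicate connection, name-resolution, timeout, or
    // transfer problems are generally transient.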
    if let Some(curl_err) = err.downcast_ref::<curl::Error>() {
        if curl_err.is_couldnt_connect()
            || curl_err.is_couldnt_resolve_proxy()
            || curl_err.is_couldnt_resolve_host()
            || curl_err.is_operation_timedout()
            || curl_err.is_recv_error()
            || curl_err.is_send_error()
            || curl_err.is_http2_error()
            || curl_err.is_http2_stream_error()
            || curl_err.is_ssl_connect_error()
            || curl_err.is_partial_file()
        {
            return true;
        }
    }
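    // HTTP server errors (5xx) and 429 (too many requests) are retried.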
    if let Some(not_200) = err.downcast_ref::<HttpNotSuccessful>() {
        if 500 <= not_200.code && not_200.code < 600 || not_200.code == 429 {
            return true;
        }
    }

    use gix::protocol::transport::IsSpuriousError;

    if let Some(err) = err.downcast_ref::<crate::sources::git::fetch::Error>() {
        if err.is_spurious() {
            return true;
        }
    }

    false
}

/// Wrapper method for network call retry logic.
///
/// The retry count is read from the `net.retry` config option. The shell
/// emits a warning for each retry.
///
/// The closure must return a `CargoResult`.
///
/// # Examples
///
/// ```
/// # use crate::cargo::util::{CargoResult, GlobalContext};
/// # let download_something = || return Ok(());
/// # let gctx = GlobalContext::default().unwrap();
/// use cargo::util::network;
/// let cargo_result = network::retry::with_retry(&gctx, || download_something());
/// ```
pub fn with_retry<T, F>(gctx: &GlobalContext, mut callback: F) -> CargoResult<T>
where
    F: FnMut() -> CargoResult<T>,
{
    let mut retry = Retry::new(gctx)?;
    loop {
        match retry.r#try(&mut callback) {
            RetryResult::Success(r) => return Ok(r),
            RetryResult::Err(e) => return Err(e),
            RetryResult::Retry(sleep) => std::thread::sleep(Duration::from_millis(sleep)),
        }
    }
}

#[test]
fn with_retry_repeats_the_call_then_works() {
    use crate::core::Shell;

    // Error HTTP codes (5xx) are considered maybe_spurious and will prompt retry
    let error1 = HttpNotSuccessful {
        code: 501,
        url: "Uri".to_string(),
        ip: None,
        body: Vec::new(),
        headers: Vec::new(),
    }
    .into();
    let error2 = HttpNotSuccessful {
        code: 502,
        url: "Uri".to_string(),
        ip: None,
        body: Vec::new(),
        headers: Vec::new(),
    }
    .into();
    let mut results: Vec<CargoResult<()>> = vec![Ok(()), Err(error1), Err(error2)];
    let gctx = GlobalContext::default().unwrap();
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let result = with_retry(&gctx, || results.pop().unwrap());
    assert!(result.is_ok())
}

#[test]
fn with_retry_finds_nested_spurious_errors() {
    use crate::core::Shell;

    // Error HTTP codes (5xx) are considered maybe_spurious and will prompt retry
    // String error messages are not considered spurious
    let error1 = anyhow::Error::from(HttpNotSuccessful {
        code: 501,
        url: "Uri".to_string(),
        ip: None,
        body: Vec::new(),
        headers: Vec::new(),
    });
    let error1 = anyhow::Error::from(error1.context("A non-spurious wrapping err"));
    let error2 = anyhow::Error::from(HttpNotSuccessful {
        code: 502,
        url: "Uri".to_string(),
        ip: None,
        body: Vec::new(),
        headers: Vec::new(),
    });
    let error2 = anyhow::Error::from(error2.context("A second chained error"));
    let mut results: Vec<CargoResult<()>> = vec![Ok(()), Err(error1), Err(error2)];
    let gctx = GlobalContext::default().unwrap();
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let result = with_retry(&gctx, || results.pop().unwrap());
    assert!(result.is_ok())
}

#[test]
fn default_retry_schedule() {
    use crate::core::Shell;

    let spurious = || -> CargoResult<()> {
        Err(anyhow::Error::from(HttpNotSuccessful {
            code: 500,
            url: "Uri".to_string(),
            ip: None,
            body: Vec::new(),
            headers: Vec::new(),
        }))
    };
    let gctx = GlobalContext::default().unwrap();
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let mut retry = Retry::new(&gctx).unwrap();
    match retry.r#try(|| spurious()) {
        RetryResult::Retry(sleep) => {
            assert!(
                sleep >= INITIAL_RETRY_SLEEP_BASE_MS
                    && sleep < INITIAL_RETRY_SLEEP_BASE_MS + INITIAL_RETRY_JITTER_MS
            );
        }
        _ => panic!("unexpected non-retry"),
    }
    match retry.r#try(|| spurious()) {
        RetryResult::Retry(sleep) => assert_eq!(sleep, 3500),
        _ => panic!("unexpected non-retry"),
    }
    match retry.r#try(|| spurious()) {
        RetryResult::Retry(sleep) => assert_eq!(sleep, 6500),
        _ => panic!("unexpected non-retry"),
    }
    match retry.r#try(|| spurious()) {
        RetryResult::Err(_) => {}
        _ => panic!("unexpected non-retry"),
    }
}

#[test]
fn curle_http2_stream_is_spurious() {
    let code = curl_sys::CURLE_HTTP2_STREAM;
    let err = curl::Error::new(code);
    assert!(maybe_spurious(&err.into()));
}

#[test]
fn retry_after_parsing() {
    use crate::core::Shell;
    fn spurious(code: u32, header: &str) -> HttpNotSuccessful {
        HttpNotSuccessful {
            code,
            url: "Uri".to_string(),
            ip: None,
            body: Vec::new(),
            headers: vec![header.to_string()],
        }
    }

    // Start of year 2025.
    let now = jiff::Timestamp::new(1735689600, 0).unwrap();
    let headers = spurious(429, "Retry-After: 10");
    assert_eq!(Retry::parse_retry_after(&headers, &now), Some(10_000));
    let headers = spurious(429, "retry-after: Wed, 01 Jan 2025 00:00:10 GMT");
    let actual = Retry::parse_retry_after(&headers, &now).unwrap();
    assert_eq!(10000, actual);

    let headers = spurious(429, "Content-Type: text/html");
    assert_eq!(Retry::parse_retry_after(&headers, &now), None);

    let headers = spurious(429, "retry-after: Fri, 01 Jan 2000 00:00:00 GMT");
    assert_eq!(Retry::parse_retry_after(&headers, &now), None);

    let headers = spurious(429, "retry-after: -1");
    assert_eq!(Retry::parse_retry_after(&headers, &now), None);

    let headers = spurious(400, "retry-after: 1");
    assert_eq!(Retry::parse_retry_after(&headers, &now), None);

    let gctx = GlobalContext::default().unwrap();
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let mut retry = Retry::new(&gctx).unwrap();
    match retry
        .r#try(|| -> CargoResult<()> { Err(anyhow::Error::from(spurious(429, "Retry-After: 7"))) })
    {
        RetryResult::Retry(sleep) => assert_eq!(sleep, 7_000),
        _ => panic!("unexpected non-retry"),
    }
}