📋 Table of Contents
- Introduction to Failure Scenarios
- Basic Retry Pattern
- Exponential Backoff
- Retry with Context and Timeout
- Circuit Breaker Pattern
- Adding Jitter for Distributed Systems
- Complete Real-World Example
- Best Practices Summary
🎯 Introduction to Failure Scenarios
In distributed systems and network programming, failures are inevitable. Common scenarios include:
- Network timeouts and temporary connectivity issues
- Service unavailability or overload
- Rate limiting by external APIs
- Database connection failures
- Transient errors that resolve themselves
Implementing proper retry mechanisms helps make your applications more resilient and reliable.
🔁 Basic Retry Pattern
The simplest retry mechanism attempts an operation multiple times with a fixed delay between attempts.
Example: Simple Retry Function
package main
import (
"errors"
"fmt"
"time"
)
// Retry calls fn until it succeeds or maxAttempts calls have been made,
// sleeping for delay between consecutive attempts.
//
// It returns nil as soon as fn returns nil. If every attempt fails, the
// returned error wraps the error from the final attempt, so callers can
// still inspect it with errors.Is or errors.As. A maxAttempts below 1 is
// rejected up front and fn is never called.
func Retry(maxAttempts int, delay time.Duration, fn func() error) error {
	if maxAttempts < 1 {
		// Without this guard the loop body never runs and the final Errorf
		// would wrap a nil error, yielding a "%!w(<nil>)" message.
		return fmt.Errorf("retry: maxAttempts must be at least 1, got %d", maxAttempts)
	}

	var err error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		if err = fn(); err == nil {
			return nil
		}
		if attempt == maxAttempts {
			break // no point in sleeping after the final attempt
		}
		fmt.Printf("Attempt %d failed: %v. Retrying in %v...\n",
			attempt, err, delay)
		time.Sleep(delay)
	}
	return fmt.Errorf("after %d attempts, last error: %w", maxAttempts, err)
}
// main demonstrates Retry with a simulated operation that fails on its
// first two calls and succeeds on the third.
func main() {
	calls := 0
	flaky := func() error {
		calls++
		if calls >= 3 {
			return nil
		}
		return errors.New("temporary failure")
	}

	if err := Retry(5, 1*time.Second, flaky); err != nil {
		fmt.Printf("Operation failed: %v\n", err)
		return
	}
	fmt.Println("Operation succeeded!")
}
⏱️ Exponential Backoff
Exponential backoff increases the delay between retries exponentially, reducing load on failing services and improving the chances of recovery.
Example: Exponential Backoff Implementation
package main
import (
"fmt"
"math"
"time"
)
// RetryWithBackoff calls fn until it succeeds or maxAttempts calls have been
// made, sleeping between attempts with exponential backoff.
//
// The wait after failed attempt k (1-based) is initialDelay * 2^(k-1),
// capped at maxDelay. It returns nil as soon as fn returns nil; if every
// attempt fails, the returned error wraps the error from the final attempt.
// A maxAttempts below 1 is rejected up front and fn is never called.
func RetryWithBackoff(maxAttempts int, initialDelay time.Duration,
	maxDelay time.Duration, fn func() error) error {
	if maxAttempts < 1 {
		// Without this guard the final Errorf would wrap a nil error.
		return fmt.Errorf("retry with backoff: maxAttempts must be at least 1, got %d", maxAttempts)
	}

	var err error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		if err = fn(); err == nil {
			return nil
		}
		if attempt == maxAttempts {
			break // no point in sleeping after the final attempt
		}

		// Apply the cap while the value is still a float64. Converting first
		// would overflow int64 once 2^(attempt-1) grows large enough, and the
		// resulting (possibly negative) Duration would slip past the cap and
		// make time.Sleep return immediately.
		delay := maxDelay
		if backoff := float64(initialDelay) * math.Pow(2, float64(attempt-1)); backoff < float64(maxDelay) {
			delay = time.Duration(backoff)
		}

		fmt.Printf("Attempt %d failed: %v. Retrying in %v...\n",
			attempt, err, delay)
		time.Sleep(delay)
	}
	return fmt.Errorf("failed after %d attempts: %w", maxAttempts, err)
}
// main demonstrates RetryWithBackoff with a simulated operation that reports
// "service unavailable" on its first three calls and succeeds on the fourth.
func main() {
	const (
		maxAttempts  = 5
		initialDelay = 500 * time.Millisecond
		maxDelay     = 10 * time.Second
	)

	var calls int
	flaky := func() error {
		calls++
		if calls >= 4 {
			return nil
		}
		return fmt.Errorf("service unavailable")
	}

	if err := RetryWithBackoff(maxAttempts, initialDelay, maxDelay, flaky); err != nil {
		fmt.Printf("Failed: %v\n", err)
		return
	}
	fmt.Println("Success!")
}
⏰ Retry with Context and Timeout
Using Go's context package allows you to set timeouts and cancellation for retry operations, preventing indefinite retries.
Example: Context-Aware Retry
package main
import (
"context"
"fmt"
"time"
)
// RetryWithContext calls fn until it succeeds, maxAttempts calls have been
// made, or ctx is done, waiting for delay between consecutive attempts.
//
// ctx is checked before every attempt and while waiting, and is passed to fn
// so the operation itself can honour cancellation. When ctx ends the retry,
// the returned error wraps ctx.Err(); when every attempt fails, it wraps the
// error from the final attempt. A maxAttempts below 1 is rejected up front
// and fn is never called.
func RetryWithContext(ctx context.Context, maxAttempts int,
	delay time.Duration, fn func(context.Context) error) error {
	if maxAttempts < 1 {
		// Without this guard the final Errorf would wrap a nil error.
		return fmt.Errorf("retry with context: maxAttempts must be at least 1, got %d", maxAttempts)
	}

	var err error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		// Do not start another attempt once the context is done.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return fmt.Errorf("operation cancelled: %w", ctxErr)
		}

		if err = fn(ctx); err == nil {
			return nil
		}
		if attempt == maxAttempts {
			break // no point in waiting after the final attempt
		}
		fmt.Printf("Attempt %d failed: %v. Retrying...\n", attempt, err)

		// Use an explicit timer rather than time.After so it can be stopped
		// as soon as the context ends the wait, instead of lingering until
		// the full delay has elapsed.
		timer := time.NewTimer(delay)
		select {
		case <-timer.C:
			// Delay elapsed; move on to the next attempt.
		case <-ctx.Done():
			timer.Stop()
			return fmt.Errorf("retry cancelled: %w", ctx.Err())
		}
	}
	return fmt.Errorf("max attempts reached: %w", err)
}
// main runs RetryWithContext under a 5-second deadline. Each call of the
// simulated operation sleeps for 2 seconds and fails until its fifth call,
// so the deadline is expected to cut the retries short.
func main() {
	deadlineCtx, stop := context.WithTimeout(context.Background(), 5*time.Second)
	defer stop()

	calls := 0
	slowOp := func(ctx context.Context) error {
		calls++
		time.Sleep(2 * time.Second) // stand-in for slow work
		if calls >= 5 {
			return nil
		}
		return fmt.Errorf("service busy")
	}

	if err := RetryWithContext(deadlineCtx, 10, 1*time.Second, slowOp); err != nil {
		fmt.Printf("Operation failed: %v\n", err)
		return
	}
	fmt.Println("Operation succeeded!")
}
🔌 Circuit Breaker Pattern
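The circuit breaker pattern stops an application from repeatedly calling a service that is failing. It has three states: Closed (calls pass through normally), Open (calls are rejected immediately without reaching the service), and Half-Open (after a cool-down period, calls are let through again to test whether the service has recovered).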
Example: Simple Circuit Breaker
package main
import (
"errors"
"fmt"
"sync"
"time"
)
// State identifies which phase of its lifecycle a CircuitBreaker is in.
type State int

const (
	// StateClosed is the normal state: calls are passed to the wrapped function.
	StateClosed State = iota
	// StateOpen rejects calls without running them until resetTimeout has
	// elapsed since the last recorded failure.
	StateOpen
	// StateHalfOpen lets a trial call through to test whether the wrapped
	// function has recovered.
	StateHalfOpen
)

// ErrCircuitOpen is returned by CircuitBreaker.Call when the breaker is open
// and the call was rejected without being executed. Compare with errors.Is.
var ErrCircuitOpen = errors.New("circuit breaker is open")

// CircuitBreaker tracks failures of a wrapped operation and rejects calls
// while the operation is considered unavailable.
type CircuitBreaker struct {
	maxFailures  int           // consecutive failures that trip the breaker
	resetTimeout time.Duration // how long the breaker stays open before a trial call is allowed
	failures     int           // consecutive failures seen since the last success or reset
	state        State         // current state
	lastFailTime time.Time     // when the most recent failure was recorded
	mu           sync.Mutex    // guards the mutable fields above
}

// NewCircuitBreaker returns a closed CircuitBreaker that opens after
// maxFailures consecutive failures and allows a trial call once resetTimeout
// has passed since the last failure.
func NewCircuitBreaker(maxFailures int, resetTimeout time.Duration) *CircuitBreaker {
	return &CircuitBreaker{
		maxFailures:  maxFailures,
		resetTimeout: resetTimeout,
		state:        StateClosed,
	}
}

// Call runs fn through the breaker and returns fn's error unchanged.
//
// While the breaker is open and resetTimeout has not yet elapsed since the
// last failure, fn is not run and ErrCircuitOpen is returned. Once the
// timeout has elapsed the breaker becomes half-open and fn is run as a
// trial: success closes the breaker, failure reopens it immediately.
//
// fn executes while the breaker's mutex is held, so concurrent calls on the
// same breaker are serialized.
func (cb *CircuitBreaker) Call(fn func() error) error {
	cb.mu.Lock()
	defer cb.mu.Unlock()

	if cb.state == StateOpen {
		if time.Since(cb.lastFailTime) <= cb.resetTimeout {
			return ErrCircuitOpen
		}
		cb.state = StateHalfOpen
		cb.failures = 0
		fmt.Println("Circuit breaker: Open -> Half-Open")
	}

	if err := fn(); err != nil {
		cb.failures++
		cb.lastFailTime = time.Now()
		switch {
		case cb.state == StateHalfOpen:
			// A failed trial means the service has not recovered. Reopen
			// right away instead of letting maxFailures more calls through.
			cb.state = StateOpen
			fmt.Println("Circuit breaker: Half-Open -> Open")
		case cb.failures >= cb.maxFailures:
			cb.state = StateOpen
			fmt.Printf("Circuit breaker opened after %d failures\n", cb.failures)
		}
		return err
	}

	// Success: a half-open breaker closes, and the failure streak is cleared.
	if cb.state == StateHalfOpen {
		cb.state = StateClosed
		fmt.Println("Circuit breaker: Half-Open -> Closed")
	}
	cb.failures = 0
	return nil
}
// main drives a CircuitBreaker (3 failures to open, 5-second reset timeout)
// with ten calls spaced one second apart. The simulated service fails for the
// first five calls and succeeds afterwards.
func main() {
	breaker := NewCircuitBreaker(3, 5*time.Second)

	for call := 1; call <= 10; call++ {
		service := func() error {
			if call > 5 {
				return nil
			}
			return fmt.Errorf("service error")
		}

		if err := breaker.Call(service); err == nil {
			fmt.Printf("Call %d succeeded\n", call)
		} else {
			fmt.Printf("Call %d failed: %v\n", call, err)
		}
		time.Sleep(1 * time.Second)
	}
}
🎲 Adding Jitter for Distributed Systems
Jitter adds randomness to retry delays, preventing synchronized retries from multiple clients that could overwhelm a recovering service.
Example: Exponential Backoff with Jitter
package main
import (
"fmt"
"math"
"math/rand"
"time"
)
// RetryWithJitter calls fn until it succeeds or maxAttempts calls have been
// made, sleeping between attempts with exponential backoff and jitter.
//
// After failed attempt k (1-based) the wait is drawn uniformly from
// [0, min(maxDelay, baseDelay * 2^(k-1))). It returns nil as soon as fn
// returns nil; if every attempt fails, the returned error wraps the error
// from the final attempt. A maxAttempts below 1 is rejected up front and fn
// is never called.
func RetryWithJitter(maxAttempts int, baseDelay time.Duration,
	maxDelay time.Duration, fn func() error) error {
	if maxAttempts < 1 {
		// Without this guard the final Errorf would wrap a nil error.
		return fmt.Errorf("retry with jitter: maxAttempts must be at least 1, got %d", maxAttempts)
	}

	var err error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		if err = fn(); err == nil {
			return nil
		}
		if attempt == maxAttempts {
			break // no point in sleeping after the final attempt
		}

		// Cap the exponential term BEFORE applying jitter. Jittering the
		// uncapped value and clamping afterwards makes most waits collapse to
		// exactly maxDelay once the backoff outgrows it, which removes the
		// randomness jitter is meant to provide. Capping in float64 also
		// keeps the Duration conversion from overflowing int64.
		ceiling := float64(maxDelay)
		if exp := float64(baseDelay) * math.Pow(2, float64(attempt-1)); exp < ceiling {
			ceiling = exp
		}
		delay := time.Duration(rand.Float64() * ceiling)

		fmt.Printf("Attempt %d failed. Retrying in %v...\n", attempt, delay)
		time.Sleep(delay)
	}
	return fmt.Errorf("failed after %d attempts: %w", maxAttempts, err)
}
// main demonstrates RetryWithJitter with a simulated operation that fails on
// its first two calls and succeeds on the third.
func main() {
	var calls int
	flaky := func() error {
		calls++
		if calls >= 3 {
			return nil
		}
		return fmt.Errorf("temporary error")
	}

	if err := RetryWithJitter(5, 100*time.Millisecond, 5*time.Second, flaky); err != nil {
		fmt.Printf("Failed: %v\n", err)
		return
	}
	fmt.Println("Success!")
}
📚 Complete Real-World Example
Here's a comprehensive example combining multiple patterns for making resilient HTTP requests.
package main
import (
"context"
"fmt"
"math"
"math/rand"
"net/http"
"time"
)
// RetryConfig holds the retry settings used by ResilientHTTPGet.
type RetryConfig struct {
	MaxAttempts  int           // maximum number of requests to send; must be at least 1
	InitialDelay time.Duration // backoff ceiling after the first failure; doubles with each further attempt
	MaxDelay     time.Duration // upper bound on any single backoff wait
	Timeout      time.Duration // not read by ResilientHTTPGet; available to callers for an overall deadline
}

// ResilientHTTPGet sends a GET request to url, retrying on transport errors
// and 5xx responses with exponential backoff and jitter.
//
// A response with a status code below 500 is returned as-is (4xx responses
// are not retried); the caller must close its body. Between attempts the wait
// is drawn uniformly from [0, min(MaxDelay, InitialDelay * 2^(attempt-1))).
// If ctx is done before an attempt or during a wait, ctx.Err() is returned.
// If every attempt fails, the returned error wraps the last failure. A
// config.MaxAttempts below 1 is rejected without sending a request.
func ResilientHTTPGet(ctx context.Context, url string, config RetryConfig) (*http.Response, error) {
	if config.MaxAttempts < 1 {
		// Without this guard the final Errorf would wrap a nil error.
		return nil, fmt.Errorf("resilient http get: MaxAttempts must be at least 1, got %d", config.MaxAttempts)
	}

	client := &http.Client{
		Timeout: 10 * time.Second, // per-request limit, independent of ctx
	}

	var lastErr error
	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
		// Do not start another attempt once the context is done.
		if err := ctx.Err(); err != nil {
			return nil, err
		}

		req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
		if err != nil {
			return nil, err
		}

		resp, err := client.Do(req)
		switch {
		case err != nil:
			// On error the response can be ignored; keep the transport
			// error as the reason this attempt failed.
			lastErr = err
		case resp.StatusCode < 500:
			// Success or client error: hand the response to the caller.
			return resp, nil
		default:
			// Server error: release the body before retrying.
			resp.Body.Close()
			lastErr = fmt.Errorf("status code: %d", resp.StatusCode)
		}

		if attempt == config.MaxAttempts {
			break // no point in waiting after the final attempt
		}

		// Cap the exponential term BEFORE applying jitter so that waits stay
		// randomized once the backoff outgrows MaxDelay, and so that the
		// Duration conversion cannot overflow int64.
		ceiling := float64(config.MaxDelay)
		if exp := float64(config.InitialDelay) * math.Pow(2, float64(attempt-1)); exp < ceiling {
			ceiling = exp
		}
		delay := time.Duration(rand.Float64() * ceiling)

		fmt.Printf("Attempt %d failed: %v. Retrying in %v...\n",
			attempt, lastErr, delay)

		// Explicit timer so it can be stopped if ctx ends the wait early.
		timer := time.NewTimer(delay)
		select {
		case <-timer.C:
		case <-ctx.Done():
			timer.Stop()
			return nil, ctx.Err()
		}
	}
	return nil, fmt.Errorf("all attempts failed: %w", lastErr)
}
// main issues a GET request through ResilientHTTPGet, bounding the whole
// operation (all attempts plus backoff waits) by the configured Timeout.
func main() {
	const endpoint = "https://api.example.com/data"

	cfg := RetryConfig{
		MaxAttempts:  5,
		InitialDelay: 500 * time.Millisecond,
		MaxDelay:     10 * time.Second,
		Timeout:      30 * time.Second,
	}

	overallCtx, stop := context.WithTimeout(context.Background(), cfg.Timeout)
	defer stop()

	response, err := ResilientHTTPGet(overallCtx, endpoint, cfg)
	if err != nil {
		fmt.Printf("Request failed: %v\n", err)
		return
	}
	defer response.Body.Close()

	fmt.Printf("Success! Status: %d\n", response.StatusCode)
}
✨ Best Practices Summary
- Use exponential backoff to reduce load on failing services
- Add jitter to prevent thundering herd in distributed systems
- Respect context cancellation for proper resource management
- Set maximum retry limits to prevent infinite loops
- Distinguish error types - don't retry client errors (4xx)
- Log retry attempts for debugging and monitoring
- Use circuit breakers to fail fast when services are down
- Set reasonable timeouts for overall operations
- Consider idempotency - ensure operations can be safely retried
For production use, consider established libraries such as github.com/cenkalti/backoff or github.com/sony/gobreaker, which provide robust implementations of these patterns.