🔄 Go Failure Handling & Retry Mechanisms

A Comprehensive Guide to Building Resilient Applications

📋 Table of Contents

- 🎯 Introduction to Failure Scenarios
- 🔁 Basic Retry Pattern
- ⏱️ Exponential Backoff
- ⏰ Retry with Context and Timeout
- 🔌 Circuit Breaker Pattern
- 🎲 Adding Jitter for Distributed Systems
- 📚 Complete Real-World Example
- ✨ Best Practices Summary

🎯 Introduction to Failure Scenarios

In distributed systems and network programming, failures are inevitable. Common scenarios include network timeouts, temporary service outages, rate limiting, dropped connections, and transient errors from databases or downstream APIs.

Implementing proper retry mechanisms helps make your applications more resilient and reliable.

🔁 Basic Retry Pattern

The simplest retry mechanism attempts an operation multiple times with a fixed delay between attempts.

Example: Simple Retry Function

package main

import (
    "errors"
    "fmt"
    "time"
)

// Retry executes the given function up to maxAttempts times
func Retry(maxAttempts int, delay time.Duration, fn func() error) error {
    var err error
    
    for attempt := 1; attempt <= maxAttempts; attempt++ {
        err = fn()
        if err == nil {
            return nil // Success
        }
        
        if attempt < maxAttempts {
            fmt.Printf("Attempt %d failed: %v. Retrying in %v...\n", 
                attempt, err, delay)
            time.Sleep(delay)
        }
    }
    
    return fmt.Errorf("after %d attempts, last error: %w", maxAttempts, err)
}

// Example usage
func main() {
    counter := 0
    
    // Simulated operation that fails twice then succeeds
    operation := func() error {
        counter++
        if counter < 3 {
            return errors.New("temporary failure")
        }
        return nil
    }
    
    err := Retry(5, 1*time.Second, operation)
    if err != nil {
        fmt.Printf("Operation failed: %v\n", err)
    } else {
        fmt.Println("Operation succeeded!")
    }
}
💡 Note: Simple retry with fixed delays works well for temporary glitches but can overwhelm systems experiencing high load.

⏱️ Exponential Backoff

Exponential backoff increases the delay between retries exponentially, reducing load on failing services and improving the chances of recovery.

Example: Exponential Backoff Implementation

package main

import (
    "fmt"
    "math"
    "time"
)

// RetryWithBackoff implements exponential backoff retry
func RetryWithBackoff(maxAttempts int, initialDelay time.Duration, 
    maxDelay time.Duration, fn func() error) error {
    
    var err error
    delay := initialDelay
    
    for attempt := 1; attempt <= maxAttempts; attempt++ {
        err = fn()
        if err == nil {
            return nil
        }
        
        if attempt < maxAttempts {
            // Calculate exponential backoff: delay * 2^(attempt-1)
            delay = time.Duration(float64(initialDelay) * math.Pow(2, float64(attempt-1)))
            
            // Cap the delay at maxDelay
            if delay > maxDelay {
                delay = maxDelay
            }
            
            fmt.Printf("Attempt %d failed: %v. Retrying in %v...\n", 
                attempt, err, delay)
            time.Sleep(delay)
        }
    }
    
    return fmt.Errorf("failed after %d attempts: %w", maxAttempts, err)
}

// Example: simulated flaky operation retried with exponential backoff
func main() {
    attempts := 0
    
    operation := func() error {
        attempts++
        if attempts < 4 {
            return fmt.Errorf("service unavailable")
        }
        return nil
    }
    
    err := RetryWithBackoff(
        5,                      // max attempts
        500*time.Millisecond,   // initial delay
        10*time.Second,         // max delay
        operation,
    )
    
    if err != nil {
        fmt.Printf("Failed: %v\n", err)
    } else {
        fmt.Println("Success!")
    }
}

⏰ Retry with Context and Timeout

Using Go's context package allows you to set timeouts and cancellation for retry operations, preventing indefinite retries.

Example: Context-Aware Retry

package main

import (
    "context"
    "fmt"
    "time"
)

// RetryWithContext respects context cancellation and timeouts
func RetryWithContext(ctx context.Context, maxAttempts int, 
    delay time.Duration, fn func(context.Context) error) error {
    
    var err error
    
    for attempt := 1; attempt <= maxAttempts; attempt++ {
        // Check if context is cancelled
        select {
        case <-ctx.Done():
            return fmt.Errorf("operation cancelled: %w", ctx.Err())
        default:
        }
        
        err = fn(ctx)
        if err == nil {
            return nil
        }
        
        if attempt < maxAttempts {
            fmt.Printf("Attempt %d failed: %v. Retrying...\n", attempt, err)
            
            // Sleep with context awareness
            select {
            case <-time.After(delay):
                // Continue to next attempt
            case <-ctx.Done():
                return fmt.Errorf("retry cancelled: %w", ctx.Err())
            }
        }
    }
    
    return fmt.Errorf("max attempts reached: %w", err)
}

// Example usage with timeout
func main() {
    // Create context with 5 second timeout
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()
    
    attempts := 0
    operation := func(ctx context.Context) error {
        attempts++
        // Simulate slow work. Note: time.Sleep ignores ctx here; a
        // context-aware variant is sketched after this example.
        time.Sleep(2 * time.Second)
        if attempts < 5 {
            return fmt.Errorf("service busy")
        }
        return nil
    }
    
    err := RetryWithContext(ctx, 10, 1*time.Second, operation)
    if err != nil {
        fmt.Printf("Operation failed: %v\n", err)
    } else {
        fmt.Println("Operation succeeded!")
    }
}
⚠️ Warning: Always respect context cancellation to prevent resource leaks and unnecessary work when operations are cancelled.
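
The simulated operation above keeps sleeping even after the context expires. Here is a minimal sketch of a context-aware replacement for that operation (same signature, same hypothetical timings) that abandons its work as soon as the context is cancelled:

operation := func(ctx context.Context) error {
    attempts++
    select {
    case <-time.After(2 * time.Second): // simulated slow work
        if attempts < 5 {
            return fmt.Errorf("service busy")
        }
        return nil
    case <-ctx.Done():
        // Stop immediately instead of finishing work nobody is waiting for
        return ctx.Err()
    }
}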

🔌 Circuit Breaker Pattern

The circuit breaker pattern prevents repeatedly calling a failing service. It has three states: Closed (normal), Open (failing), and Half-Open (testing recovery).

Example: Simple Circuit Breaker

package main

import (
    "errors"
    "fmt"
    "sync"
    "time"
)

type State int

const (
    StateClosed State = iota
    StateOpen
    StateHalfOpen
)

type CircuitBreaker struct {
    maxFailures  int
    resetTimeout time.Duration
    failures     int
    state        State
    lastFailTime time.Time
    mu           sync.Mutex
}

func NewCircuitBreaker(maxFailures int, resetTimeout time.Duration) *CircuitBreaker {
    return &CircuitBreaker{
        maxFailures:  maxFailures,
        resetTimeout: resetTimeout,
        state:        StateClosed,
    }
}

func (cb *CircuitBreaker) Call(fn func() error) error {
    cb.mu.Lock()
    defer cb.mu.Unlock()
    
    // Check if we should transition from Open to Half-Open
    if cb.state == StateOpen {
        if time.Since(cb.lastFailTime) > cb.resetTimeout {
            cb.state = StateHalfOpen
            cb.failures = 0
            fmt.Println("Circuit breaker: Open -> Half-Open")
        } else {
            return errors.New("circuit breaker is open")
        }
    }
    
    // Execute the function. Note: the mutex is held while fn runs, which
    // serializes calls; fine for this example, but not for slow operations.
    err := fn()
    
    if err != nil {
        cb.failures++
        cb.lastFailTime = time.Now()
        
        // A failed trial call in Half-Open re-opens the breaker immediately;
        // in Closed, it opens once the failure threshold is reached.
        if cb.state == StateHalfOpen || cb.failures >= cb.maxFailures {
            cb.state = StateOpen
            fmt.Printf("Circuit breaker opened after %d failure(s)\n", cb.failures)
        }
        
        return err
    }
    
    // Success - reset to closed state
    if cb.state == StateHalfOpen {
        cb.state = StateClosed
        fmt.Println("Circuit breaker: Half-Open -> Closed")
    }
    cb.failures = 0
    
    return nil
}

// Example usage
func main() {
    cb := NewCircuitBreaker(3, 5*time.Second)
    
    // Simulate multiple calls
    for i := 1; i <= 10; i++ {
        err := cb.Call(func() error {
            // Simulate failing service
            if i <= 5 {
                return fmt.Errorf("service error")
            }
            return nil
        })
        
        if err != nil {
            fmt.Printf("Call %d failed: %v\n", i, err)
        } else {
            fmt.Printf("Call %d succeeded\n", i)
        }
        
        time.Sleep(1 * time.Second)
    }
}
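
The breaker also composes naturally with the Retry helper from the basic pattern: the retry loop drives the attempts, while the breaker short-circuits them once the failure threshold is reached. A minimal sketch reusing the Retry function and CircuitBreaker type defined above; callRemoteService is a hypothetical stand-in for a real dependency:

cb := NewCircuitBreaker(3, 5*time.Second)

// callRemoteService is a hypothetical placeholder for a real network call.
callRemoteService := func() error {
    return errors.New("service error")
}

// Each retry attempt goes through the breaker; once it opens, attempts
// fail fast with "circuit breaker is open" instead of hitting the service.
err := Retry(5, 1*time.Second, func() error {
    return cb.Call(callRemoteService)
})
if err != nil {
    fmt.Printf("giving up: %v\n", err)
}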

🎲 Adding Jitter for Distributed Systems

Jitter adds randomness to retry delays, preventing synchronized retries from multiple clients that could overwhelm a recovering service.

Example: Exponential Backoff with Jitter

package main

import (
    "fmt"
    "math"
    "math/rand"
    "time"
)

// RetryWithJitter adds random jitter to exponential backoff
func RetryWithJitter(maxAttempts int, baseDelay time.Duration, 
    maxDelay time.Duration, fn func() error) error {
    
    var err error
    
    for attempt := 1; attempt <= maxAttempts; attempt++ {
        err = fn()
        if err == nil {
            return nil
        }
        
        if attempt < maxAttempts {
            // Calculate exponential backoff
            expDelay := float64(baseDelay) * math.Pow(2, float64(attempt-1))
            
            // "Full jitter": pick a random delay between 0 and the exponential delay
            jitter := rand.Float64() * expDelay
            delay := time.Duration(jitter)
            
            // Cap at maxDelay
            if delay > maxDelay {
                delay = maxDelay
            }
            
            fmt.Printf("Attempt %d failed. Retrying in %v...\n", attempt, delay)
            time.Sleep(delay)
        }
    }
    
    return fmt.Errorf("failed after %d attempts: %w", maxAttempts, err)
}

func main() {
    attempts := 0
    
    operation := func() error {
        attempts++
        if attempts < 3 {
            return fmt.Errorf("temporary error")
        }
        return nil
    }
    
    err := RetryWithJitter(5, 100*time.Millisecond, 5*time.Second, operation)
    if err != nil {
        fmt.Printf("Failed: %v\n", err)
    } else {
        fmt.Println("Success!")
    }
}
✅ Best Practice: Use jitter in distributed systems to prevent the "thundering herd" problem where many clients retry simultaneously.

📚 Complete Real-World Example

Here's a comprehensive example combining multiple patterns for making resilient HTTP requests.

package main

import (
    "context"
    "fmt"
    "math"
    "math/rand"
    "net/http"
    "time"
)

type RetryConfig struct {
    MaxAttempts  int
    InitialDelay time.Duration
    MaxDelay     time.Duration
    Timeout      time.Duration
}

// ResilientHTTPGet makes an HTTP GET request with retry logic
func ResilientHTTPGet(ctx context.Context, url string, config RetryConfig) (*http.Response, error) {
    // Per-attempt timeout; the overall deadline comes from ctx (config.Timeout).
    client := &http.Client{
        Timeout: 10 * time.Second,
    }
    
    var lastErr error
    
    for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
        // Check context
        select {
        case <-ctx.Done():
            return nil, ctx.Err()
        default:
        }
        
        // Create request with context
        req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
        if err != nil {
            return nil, err
        }
        
        // Attempt the request
        resp, err := client.Do(req)
        if err == nil && resp.StatusCode < 500 {
            // Success or client error (don't retry 4xx)
            return resp, nil
        }
        
        if resp != nil {
            resp.Body.Close()
            lastErr = fmt.Errorf("status code: %d", resp.StatusCode)
        } else {
            lastErr = err
        }
        
        // Calculate backoff with jitter
        if attempt < config.MaxAttempts {
            expDelay := float64(config.InitialDelay) * math.Pow(2, float64(attempt-1))
            jitter := rand.Float64() * expDelay
            delay := time.Duration(jitter)
            
            if delay > config.MaxDelay {
                delay = config.MaxDelay
            }
            
            fmt.Printf("Attempt %d failed: %v. Retrying in %v...\n", 
                attempt, lastErr, delay)
            
            select {
            case <-time.After(delay):
            case <-ctx.Done():
                return nil, ctx.Err()
            }
        }
    }
    
    return nil, fmt.Errorf("all attempts failed: %w", lastErr)
}

func main() {
    config := RetryConfig{
        MaxAttempts:  5,
        InitialDelay: 500 * time.Millisecond,
        MaxDelay:     10 * time.Second,
        Timeout:      30 * time.Second,
    }
    
    ctx, cancel := context.WithTimeout(context.Background(), config.Timeout)
    defer cancel()
    
    resp, err := ResilientHTTPGet(ctx, "https://api.example.com/data", config)
    if err != nil {
        fmt.Printf("Request failed: %v\n", err)
        return
    }
    defer resp.Body.Close()
    
    fmt.Printf("Success! Status: %d\n", resp.StatusCode)
}

✨ Best Practices Summary

- Retry only transient failures; don't retry client errors such as HTTP 4xx responses.
- Use exponential backoff and cap the maximum delay instead of hammering a struggling service at a fixed interval.
- Add jitter so many clients don't retry in lockstep (the "thundering herd" problem).
- Bound every retry loop with a maximum attempt count and a context timeout, and always respect cancellation.
- Use a circuit breaker to stop calling a dependency that is clearly down and give it time to recover.
- Wrap the last error (%w) so callers can still inspect why all attempts failed.

📖 Further Reading: For production systems, consider using battle-tested libraries like github.com/cenkalti/backoff or github.com/sony/gobreaker that provide robust implementations of these patterns.
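
As a rough illustration, here is a minimal sketch of the same retry-with-backoff idea using github.com/cenkalti/backoff. It assumes the library's v4 API (backoff.NewExponentialBackOff applies randomized jitter by default); consult the library's documentation for the exact, current interface.

package main

import (
    "context"
    "errors"
    "fmt"
    "time"

    "github.com/cenkalti/backoff/v4"
)

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    operation := func() error {
        // Replace with a real call; returning an error triggers another attempt.
        return errors.New("service unavailable")
    }

    // Exponential backoff with jitter, capped at 5 retries and bounded by ctx.
    policy := backoff.WithContext(backoff.WithMaxRetries(backoff.NewExponentialBackOff(), 5), ctx)
    if err := backoff.Retry(operation, policy); err != nil {
        fmt.Printf("giving up: %v\n", err)
    }
}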