Skip to content

Error Handling Examples


Basic Retry with Backoff

# Example: Basic Retry with Backoff
# Shows how to automatically retry failed steps with configurable
# max attempts, interval, and exponential backoff.
#
# Retry is useful for flaky operations like network calls,
# cloud API requests, or CI/CD deployments.
#
# Try: orchstep run deploy-with-retry
# Try: orchstep run health-check
name: basic-retry-demo
desc: "Automatic retry for unreliable operations"
tasks:
  # -- Retry a deployment that may fail transiently --
  deploy-with-retry:
    desc: "Deploy service with automatic retry on failure"
    steps:
      - name: deploy_service
        func: shell
        do: |
          echo "Deploying payment-service..."
          # In real usage: kubectl apply -f deployment.yml
          # Simulating a flaky deploy that succeeds on attempt 3
          COUNT_FILE="/tmp/deploy_attempt"
          if [ -f "$COUNT_FILE" ]; then
            COUNT=$(cat $COUNT_FILE)
          else
            COUNT=0
          fi
          COUNT=$((COUNT + 1))
          echo $COUNT > $COUNT_FILE
          echo "Attempt: $COUNT"
          if [ $COUNT -lt 3 ]; then
            echo "Error: connection reset by peer"
            exit 1
          fi
          echo "Deployment successful"
        retry:
          max_attempts: 5  # Try up to 5 times
          interval: 100ms  # Start with 100ms between attempts
          backoff_rate: 1.5  # Each wait = previous * 1.5
        outputs:
          result: "{{ result.output }}"
      - name: cleanup
        func: shell
        do: rm -f /tmp/deploy_attempt
  # -- Retry with a delay cap --
  health-check:
    desc: "Health check with capped backoff delay"
    steps:
      - name: check_service
        func: shell
        do: |
          echo "Checking service health..."
          # In real usage: curl -f http://service:8080/healthz
          echo "Service responding"
        retry:
          max_attempts: 4
          interval: 500ms  # Start at 500ms
          backoff_rate: 3.0  # Aggressive backoff: 500ms -> 1.5s -> 4.5s
          max_delay: 2s  # But never wait more than 2 seconds

Conditional Retry

# Example: Conditional Retry
# Shows how to retry only when a specific condition is met,
# using the `when` clause inside retry configuration.
#
# Conditions can be written in JavaScript or Go template syntax.
# Available context: result.exit_code, result.output, retry.attempt, vars.*
#
# Try: orchstep run retry-on-timeout
# Try: orchstep run retry-on-output-match
name: conditional-retry-demo
desc: "Retry only when specific conditions are met"
tasks:
  # -- Retry only on timeout exit codes --
  retry-on-timeout:
    desc: "Retry network calls only when they time out (exit code 124)"
    steps:
      - name: call_external_api
        func: shell
        do: |
          echo "Calling payment gateway..."
          # In real usage: curl --max-time 5 https://api.payment.com/charge
          # Exit code 124 = timeout, 0 = success, other = permanent error
          echo "Request completed"
        retry:
          max_attempts: 3
          interval: 2s
          # JavaScript expression: only retry on timeout
          when: |
            result.exit_code == 124
  # -- Retry based on output content --
  retry-on-output-match:
    desc: "Retry only when output indicates a transient error"
    steps:
      - name: sync_database
        func: shell
        do: |
          echo "Syncing database replica..."
          # In real usage: pg_basebackup or similar
          echo "Sync complete"
        retry:
          max_attempts: 5
          interval: 1s
          # JavaScript: retry when output contains "timeout" or "connection refused"
          when: |
            result.exit_code != 0 && result.output.includes('timeout')
  # -- Retry with attempt-aware logic --
  retry-with-escalation:
    desc: "Change retry behavior based on attempt number"
    steps:
      - name: provision_resource
        func: shell
        do: |
          echo "Provisioning cloud resource..."
          # In real usage: terraform apply
          echo "Resource provisioned"
        retry:
          max_attempts: 5
          interval: 1s
          # JavaScript: stop retrying after 3 attempts for exit code 1
          when: |
            retry.attempt < 3 && result.exit_code == 1
  # -- Go template syntax for conditions --
  retry-go-template:
    desc: "Retry using Go template when condition"
    steps:
      - name: deploy_container
        func: shell
        do: |
          echo "Pulling and deploying container image..."
          # In real usage: docker pull && docker run
          echo "Container deployed"
        retry:
          max_attempts: 4
          interval: 500ms
          # Go template syntax: retry on non-zero exit with retryable output
          when: '{{ and (ne .result.exit_code 0) (contains "Retryable" .result.output) }}'
  # -- Retry with variable-based conditions --
  retry-with-vars:
    desc: "Use task variables in retry conditions"
    vars:
      max_retryable_exit_code: 10
    steps:
      - name: run_migration
        func: shell
        do: |
          echo "Running database migration..."
          # In real usage: flyway migrate or alembic upgrade
          echo "Migration complete"
        retry:
          max_attempts: 3
          interval: 500ms
          # JavaScript: only retry if exit code is within retryable range
          when: |
            result.exit_code <= vars.max_retryable_exit_code
  # -- Multiple AND conditions --
  retry-multi-condition:
    desc: "Retry only when multiple conditions match"
    steps:
      - name: push_to_registry
        func: shell
        do: |
          echo "Pushing image to container registry..."
          # In real usage: docker push myregistry.io/app:latest
          echo "Push complete"
        retry:
          max_attempts: 5
          interval: 1s
          # JavaScript: all conditions must be true to retry
          when: |
            result.exit_code == 2 &&
            result.output.includes('WARN') &&
            result.output.includes('temporary')

on_error Modes (fail / ignore / warn)

# Example: on_error Modes (fail / ignore / warn)
# Controls what happens when a step fails:
#
# fail - (default) Stop the task immediately
# ignore - Silently continue to the next step
# warn - Continue but mark the step with "warning" status
#
# Use `steps.<name>.status` and `steps.<name>.error` to inspect
# the result of warned/ignored steps in subsequent steps.
#
# Try: orchstep run graceful-monitoring
# Try: orchstep run best-effort-cleanup
name: on-error-modes-demo
desc: "Control error behavior per step: fail, ignore, or warn"
tasks:
  # -- Default behavior: fail stops the task --
  strict-pipeline:
    desc: "Default fail mode - task stops on first error"
    steps:
      - name: critical_step
        func: shell
        do: |
          echo "Running critical validation..."
          # If this fails, the task stops immediately
          echo "Validation passed"
      - name: next_step
        func: shell
        do: |
          echo "This only runs if the previous step succeeded"
  # -- Ignore mode: continue despite errors --
  best-effort-cleanup:
    desc: "Clean up resources, ignoring individual failures"
    steps:
      - name: delete_temp_files
        func: shell
        do: |
          echo "Deleting temporary files..."
          # Even if this fails, continue with other cleanup
          rm -rf /tmp/orchstep-build-* 2>/dev/null || true
          echo "Temp files cleaned"
        on_error: ignore
      - name: remove_containers
        func: shell
        do: |
          echo "Removing stopped containers..."
          # docker rm $(docker ps -aq --filter status=exited) 2>/dev/null
          echo "Containers cleaned"
        on_error: ignore
      - name: cleanup_summary
        func: shell
        do: |
          echo "Cleanup finished (errors were ignored)"
  # -- Warn mode: continue with status tracking --
  graceful-monitoring:
    desc: "Monitor multiple services, tracking warnings"
    steps:
      - name: check_database
        func: shell
        do: |
          echo "Checking database health..."
          # In real usage: pg_isready -h db-host
          echo "Database OK"
        on_error: warn
      - name: check_cache
        func: shell
        do: |
          echo "Checking cache health..."
          # In real usage: redis-cli ping
          echo "Cache OK"
        on_error: warn
      - name: check_queue
        func: shell
        do: |
          echo "Checking message queue health..."
          # In real usage: rabbitmqctl status
          echo "Queue OK"
        on_error: warn
      - name: report_health
        func: shell
        do: |
          echo "=== Health Check Report ==="
          echo "Database: {{ steps.check_database.status }}"
          echo "Cache: {{ steps.check_cache.status }}"
          echo "Queue: {{ steps.check_queue.status }}"
  # -- Combine on_error with retry --
  retry-then-warn:
    desc: "Retry first, then warn if all attempts fail"
    steps:
      - name: sync_metrics
        func: shell
        do: |
          echo "Pushing metrics to monitoring service..."
          # In real usage: curl -X POST http://prometheus/api/v1/write
          echo "Metrics pushed"
        retry:
          max_attempts: 3
          interval: 500ms
        on_error: warn  # If all retries fail, warn but continue
      - name: continue_pipeline
        func: shell
        do: |
          echo "Pipeline continues regardless of metrics push"
          echo "Metrics step status: {{ steps.sync_metrics.status }}"
  # -- on_error with loops --
  loop-with-warnings:
    desc: "Process items in a loop, warning on individual failures"
    steps:
      - name: process_batch
        loop:
          count: 3
        func: shell
        do: |
          echo "Processing item {{ loop.index }}..."
          # Some items may fail, but we want to process them all
          echo "Item {{ loop.index }} done"
        on_error: warn
  # -- Mixed modes in a single workflow --
  mixed-error-modes:
    desc: "Different error modes for different step importance"
    steps:
      - name: optional_warmup
        func: shell
        do: echo "Warming up cache (optional)..."
        on_error: ignore  # Not critical, skip silently
      - name: advisory_check
        func: shell
        do: echo "Running advisory security scan..."
        on_error: warn  # Want to know, but not blocking
      - name: critical_deploy
        func: shell
        do: echo "Deploying to production..."
        # on_error defaults to "fail" - this MUST succeed

Retry with Exponential Backoff and Jitter

# Example: Retry with Exponential Backoff and Jitter
# Shows how to prevent "thundering herd" problems by adding
# random jitter to retry delays.
#
# Jitter adds randomness to retry intervals so that multiple
# clients retrying simultaneously don't all hit the server
# at the same moment.
#
# jitter: 0.0 = no randomness (deterministic delays)
# jitter: 0.3 = +/-30% variation around the calculated delay
# jitter: 0.5 = +/-50% variation
# jitter: 1.0 = +/-100% variation (0 to 2x the delay)
#
# Try: orchstep run resilient-deploy
# Try: orchstep run api-with-backoff
name: retry-with-jitter-demo
desc: "Resilient retry with jitter to prevent thundering herd"
tasks:
  # -- Basic jitter for distributed retries --
  resilient-deploy:
    desc: "Deploy with jitter to avoid retry storms"
    steps:
      - name: deploy_to_cluster
        func: shell
        do: |
          echo "Deploying application to cluster..."
          # In real usage: kubectl apply -f deployment.yml
          echo "Deployment successful"
        retry:
          max_attempts: 5
          interval: 2s
          jitter: 0.3  # +/-30% variation: delays between 1.4s and 2.6s
  # -- Exponential backoff with jitter --
  api-with-backoff:
    desc: "API calls with exponential backoff and jitter"
    steps:
      - name: call_rate_limited_api
        func: shell
        do: |
          echo "Calling rate-limited API..."
          # In real usage: curl https://api.example.com/data
          echo "Response received"
        retry:
          max_attempts: 5
          interval: 100ms
          backoff_rate: 2.0  # Double the delay each time
          jitter: 0.3  # +/-30% jitter on each backoff delay
          # Approximate delays:
          # Attempt 1->2: ~100ms (70ms - 130ms)
          # Attempt 2->3: ~200ms (140ms - 260ms)
          # Attempt 3->4: ~400ms (280ms - 520ms)
          # Attempt 4->5: ~800ms (560ms - 1040ms)
  # -- Jitter with max delay cap --
  capped-backoff:
    desc: "Backoff with jitter capped at a maximum delay"
    steps:
      - name: sync_to_remote
        func: shell
        do: |
          echo "Syncing data to remote storage..."
          # In real usage: aws s3 sync ./data s3://bucket/
          echo "Sync complete"
        retry:
          max_attempts: 5
          interval: 100ms
          backoff_rate: 3.0  # Triple each time (aggressive backoff)
          max_delay: 500ms  # Never wait more than 500ms
          jitter: 0.5  # +/-50% on the capped delay
          # Delays: 100ms, 300ms, 500ms (capped), 500ms (capped)
          # With jitter: varies +/-50% around each value
  # -- High jitter for maximum spread --
  distributed-workers:
    desc: "Maximum jitter spread for distributed worker retries"
    steps:
      - name: acquire_lock
        func: shell
        do: |
          echo "Acquiring distributed lock..."
          # In real usage: redis SET lock NX EX 30
          echo "Lock acquired"
        retry:
          max_attempts: 4
          interval: 200ms
          jitter: 1.0  # +/-100%: delays between 0ms and 400ms
          # Maximum spread prevents multiple workers from colliding
  # -- Multiple steps with independent jitter --
  parallel-service-calls:
    desc: "Each step retries with its own jitter pattern"
    steps:
      - name: call_auth_service
        func: shell
        do: echo "Authenticating with auth service..."
        retry:
          max_attempts: 3
          interval: 100ms
          jitter: 0.3
        on_error: warn
      - name: call_data_service
        func: shell
        do: echo "Fetching data from data service..."
        retry:
          max_attempts: 3
          interval: 200ms
          jitter: 0.5
        on_error: warn
      - name: aggregate_results
        func: shell
        do: |
          echo "Aggregating service responses..."
          echo "Auth status: {{ steps.call_auth_service.status }}"
          echo "Data status: {{ steps.call_data_service.status }}"

Timeout Management

# Example: Timeout Management
# Shows how to set time limits on step execution and combine
# timeouts with retry for resilient operations.
#
# Supported duration formats: 500ms, 2s, 1m, 5m
# Timeout exit code: 124 (can be used in retry conditions)
#
# Try: orchstep run api-call-with-timeout
# Try: orchstep run timeout-with-retry
name: timeout-management-demo
desc: "Control step execution time with timeouts"
tasks:
  # -- Basic timeout on a step --
  api-call-with-timeout:
    desc: "Prevent API calls from hanging indefinitely"
    steps:
      - name: fetch_user_data
        func: shell
        do: |
          echo "Fetching user data from API..."
          # In real usage: curl --max-time 5 https://api.example.com/users
          sleep 0.5
          echo "Data received"
        timeout: 5s  # Kill step if it takes longer than 5 seconds
      - name: process_results
        func: shell
        do: |
          echo "Processing user data..."
          echo "Done"
  # -- Different timeout formats --
  timeout-formats:
    desc: "Demonstrate different duration format options"
    steps:
      - name: quick_check
        func: shell
        do: echo "Fast operation"
        timeout: 500ms  # Milliseconds
      - name: moderate_task
        func: shell
        do: echo "Medium operation"
        timeout: 10s  # Seconds
      - name: long_running_job
        func: shell
        do: echo "Long operation"
        timeout: 5m  # Minutes
  # -- Timeout combined with retry --
  timeout-with-retry:
    desc: "Retry operations that time out, with per-attempt timeout"
    steps:
      - name: wait_for_service
        func: shell
        do: |
          echo "Waiting for service to become ready..."
          # In real usage: curl http://service:8080/ready
          # Each attempt has its own 2-second timeout window
          sleep 0.5
          echo "Service is ready"
        timeout: 2s  # Each attempt gets 2 seconds max
        retry:
          max_attempts: 4
          interval: 1s
        outputs:
          status: "{{ result.output }}"
  # -- Retry only on timeout (not other errors) --
  selective-timeout-retry:
    desc: "Retry on timeout but fail fast on other errors"
    steps:
      - name: deploy_and_wait
        func: shell
        do: |
          echo "Deploying and waiting for rollout..."
          # In real usage: kubectl rollout status deployment/app
          echo "Rollout complete"
        timeout: 30s
        retry:
          max_attempts: 3
          interval: 5s
          # Exit code 124 = timeout. Only retry timeouts.
          when: |
            result.exit_code == 124
  # -- Timeout with error handling --
  timeout-with-catch:
    desc: "Handle timeout failures gracefully"
    steps:
      - name: long_running_report
        func: shell
        do: |
          echo "Generating quarterly report..."
          # In real usage: complex data aggregation query
          sleep 0.5
          echo "Report generated"
        timeout: 5s
        # catch/finally attached at step level — NOTE(review): nesting
        # reconstructed from the flattened source; confirm against the
        # orchstep schema.
        catch:
          - name: handle_report_timeout
            func: shell
            do: |
              echo "Report generation timed out"
              echo "Scheduling async report job instead..."
              echo "Exit code: {{ vars.error.exit_code }}"
              # In real usage: enqueue background job
        finally:
          - name: notify_status
            func: shell
            do: |
              echo "Sending status notification..."
              echo "Report task completed"

Try / Catch / Finally Error Handling

# Example: Try / Catch / Finally Error Handling
# Shows the full error handling chain: retry -> catch -> finally.
#
# - catch: runs when a step fails (after retries are exhausted)
# - finally: always runs, whether the step succeeded or failed
# - catch + finally can be combined for robust error recovery
#
# The error context (vars.error) provides: step_name, exit_code,
# output, message, timestamp, and attempt count.
#
# Try: orchstep run deploy-pipeline
# Try: orchstep run database-migration
# Try: orchstep run full-error-chain
name: try-catch-finally-demo
desc: "Robust error handling with catch and finally blocks"
tasks:
  # -- Catch block for error recovery --
  deploy-pipeline:
    desc: "Deploy with rollback on failure"
    steps:
      - name: deploy_to_production
        func: shell
        do: |
          echo "Deploying v2.5.0 to production cluster..."
          # In real usage: kubectl apply -f manifests/
          echo "Deployment applied"
        catch:
          # Catch runs if deploy_to_production fails
          - name: rollback_deployment
            func: shell
            do: |
              echo "Deployment failed! Rolling back..."
              echo "Error: {{ vars.error.message }}"
              echo "Failed step: {{ vars.error.step_name }}"
              # In real usage: kubectl rollout undo deployment/app
              echo "Rollback complete"
          - name: notify_team
            func: shell
            do: |
              echo "Sending failure notification..."
              echo "Exit code was: {{ vars.error.exit_code }}"
              # In real usage: curl -X POST slack-webhook-url
      # This step runs if deploy succeeded or catch recovered
      - name: verify_deployment
        func: shell
        do: |
          echo "Running smoke tests..."
          echo "All checks passed"
  # -- Finally block for guaranteed cleanup --
  database-migration:
    desc: "Database migration with guaranteed cleanup"
    steps:
      - name: run_migration
        func: shell
        do: |
          echo "Acquiring migration lock..."
          echo "Running schema changes..."
          # In real usage: flyway migrate -url=jdbc:postgresql://db:5432/app
          echo "Migration complete"
        finally:
          # Finally ALWAYS runs - success or failure
          - name: release_lock
            func: shell
            do: |
              echo "Releasing migration lock..."
              # In real usage: release distributed lock
              echo "Lock released"
          - name: log_completion
            func: shell
            do: |
              echo "Logging migration result..."
              # Check if there was an error
              echo "Error context: {{ vars.error | default \"none\" }}"
  # -- Full chain: retry + catch + finally --
  full-error-chain:
    desc: "Complete error handling: retry, catch, and finally combined"
    steps:
      - name: sync_data
        func: shell
        do: |
          echo "Syncing data to backup region..."
          # In real usage: aws s3 sync or rsync
          echo "Sync complete"
        # 1. First, retry on transient failures
        retry:
          max_attempts: 3
          interval: 1s
        # 2. If all retries fail, catch handles the error
        catch:
          - name: log_sync_failure
            func: shell
            do: |
              echo "Data sync failed after retries"
              echo "Attempts made: {{ vars.error.attempt }}"
              echo "Last error: {{ vars.error.output }}"
              # Mark region as degraded instead of failing hard
              echo "Marking backup region as degraded"
        # 3. Finally always runs (cleanup, metrics, etc.)
        finally:
          - name: report_metrics
            func: shell
            do: |
              echo "Recording sync metrics..."
              # In real usage: push to Prometheus / Datadog
              echo "Metrics recorded"
  # -- Conditional retry + catch fallback --
  conditional-with-catch:
    desc: "Stop retrying on fatal errors, fall through to catch"
    steps:
      - name: connect_to_service
        func: shell
        do: |
          echo "Connecting to external service..."
          echo "Connection established"
        retry:
          max_attempts: 5
          interval: 500ms
          # Only retry on transient errors (exit code 1)
          # Fatal errors (exit code 2+) go straight to catch
          when: |
            result.exit_code == 1
        catch:
          - name: handle_fatal_error
            func: shell
            do: |
              echo "Fatal error detected: {{ vars.error.output }}"
              echo "Switching to fallback service..."
        finally:
          - name: close_connections
            func: shell
            do: |
              echo "Closing all connections..."
              echo "Cleanup complete"
  # -- Multi-step workflow with mixed error handling --
  multi-step-pipeline:
    desc: "Pipeline where each step has its own error handling"
    steps:
      - name: build_artifact
        func: shell
        do: |
          echo "Building application artifact..."
          echo "Build successful"
        retry:
          max_attempts: 3
          interval: 200ms
        finally:
          - name: cleanup_build_cache
            func: shell
            do: echo "Clearing build cache..."
      - name: run_tests
        func: shell
        do: |
          echo "Running integration tests..."
          echo "All tests passed"
        catch:
          - name: collect_test_logs
            func: shell
            do: echo "Collecting test failure logs..."
        finally:
          - name: teardown_test_env
            func: shell
            do: echo "Tearing down test environment..."