# Cluster Health Status #4750
# Probes every URL listed in the CLUSTER_ENDPOINTS secret (one URL per
# line; blank lines and lines starting with '#' are ignored) and fails the
# run if any endpoint is still unreachable after three attempts.
# Triggers: hourly schedule, pushes touching workflow files, manual run.
name: Cluster Health Status

on:
  schedule:
    # Minute 0 of every hour.
    - cron: '0 * * * *'
  push:
    # Re-run whenever a workflow definition changes.
    paths:
      - '.github/workflows/*'
  # Allow manual runs.
  workflow_dispatch:

jobs:
  health-check:
    name: Monitor Cluster Endpoints
    runs-on: ubuntu-latest
    steps:
      - name: Check endpoints
        env:
          # Hand the secret to the script through the environment instead
          # of expanding it inside the script text, so quotes, `$` or
          # backticks in the value cannot alter the shell code.
          ENDPOINTS: ${{ secrets.CLUSTER_ENDPOINTS }}
        run: |
          # Validate input
          if [[ -z "${ENDPOINTS:-}" ]]; then
            echo "❌ No endpoints found in CLUSTER_ENDPOINTS secret"
            exit 1
          fi

          # Initialize counters
          total=0
          failed=0

          # check_endpoint URL
          #   Probes URL with an HTTP HEAD request, trying up to
          #   max_attempts times with a 2-second pause before each retry.
          #   Prints the outcome; returns 0 on the first success and 1 if
          #   every attempt failed.
          check_endpoint() {
            local url="$1"
            local max_attempts=3
            local attempt=1
            while [[ $attempt -le $max_attempts ]]; do
              if [[ $attempt -gt 1 ]]; then
                echo " Retry $((attempt - 1))/$((max_attempts - 1))..."
                sleep 2  # Wait 2 seconds between retries
              fi
              # -s: silent, -f: non-zero exit on HTTP errors (>= 400),
              # -L: follow redirects, -I: HEAD request only.
              # --max-time caps each attempt at 10 seconds.
              if curl -sfLI --max-time 10 "$url" >/dev/null 2>&1; then
                if [[ $attempt -gt 1 ]]; then
                  echo "✅ OK (succeeded on attempt $attempt)"
                else
                  echo "✅ OK"
                fi
                return 0
              fi
              attempt=$((attempt + 1))
            done
            echo "❌ FAILED (all $max_attempts attempts failed)"
            return 1
          }

          # Process each endpoint
          while IFS= read -r url; do
            # Trim surrounding whitespace (including a trailing CR from
            # CRLF input) with parameter expansion. Unlike piping through
            # xargs, this never reinterprets quotes or backslashes that
            # may appear in the URL.
            url="${url#"${url%%[![:space:]]*}"}"
            url="${url%"${url##*[![:space:]]}"}"
            # Skip empty lines and comments
            [[ -z "$url" || "$url" == \#* ]] && continue
            total=$((total + 1))
            echo "Testing: $url"
            # Called inside `if` so a failed endpoint is counted rather
            # than aborting the whole script.
            if ! check_endpoint "$url"; then
              failed=$((failed + 1))
            fi
            echo ""
          done <<< "$ENDPOINTS"

          # Summary
          echo "📊 Summary: $((total - failed))/$total endpoints OK"

          # Exit with error if any failed
          if [[ $failed -gt 0 ]]; then
            echo "💥 $failed endpoint(s) failed!"
            exit 1
          else
            echo "🎉 All endpoints healthy!"
          fi