Автоматические откаты, health checks, интеграция с мониторингом.
Надёжный деплой невозможен без автоматических откатов и мониторинга. Изучите health checks, интеграцию с Prometheus/Grafana и стратегии автоматического rollback.
Определяет, готов ли под принимать трафик.
readinessProbe:
  httpGet:
    path: /health/ready
    port: 8080
  initialDelaySeconds: 5
  periodSeconds: 10
  timeoutSeconds: 5
  successThreshold: 1
  failureThreshold: 3

Параметры:
- initialDelaySeconds — задержка перед первой проверкой
- periodSeconds — интервал между проверками
- timeoutSeconds — таймаут одной проверки
- failureThreshold — число неудач до пометки пода как Unready

Определяет, работает ли под корректно.
livenessProbe:
  httpGet:
    path: /health/live
    port: 8080
  initialDelaySeconds: 15
  periodSeconds: 20
  timeoutSeconds: 5
  successThreshold: 1
  failureThreshold: 3

Отличие от readiness:
Для медленно стартующих приложений.
startupProbe:
  httpGet:
    path: /health/startup
    port: 8080
  initialDelaySeconds: 0
  periodSeconds: 10
  timeoutSeconds: 5
  failureThreshold: 30  # 30 * 10 = up to 300 seconds allowed for startup

Применение:
# Check rollout status
kubectl rollout status deployment/my-app --timeout=300s

# View revision history
kubectl rollout history deployment/my-app

# Details of a specific revision
kubectl rollout history deployment/my-app --revision=3

# Roll back to the previous version
kubectl rollout undo deployment/my-app

# Roll back to a specific revision
kubectl rollout undo deployment/my-app --to-revision=2

name: Deploy with Auto-Rollback
on:
  push:
    branches: [main]

jobs:
  deploy:
    runs-on: ubuntu-latest
    environment: production
    steps:
      - uses: actions/checkout@v4

      - name: Deploy
        run: |
          kubectl set image deployment/my-app app=myregistry/app:${{ github.sha }}
          kubectl rollout status deployment/my-app --timeout=300s
        env:
          KUBECONFIG: ${{ secrets.KUBE_CONFIG }}

      - name: Health Check
        run: |
          for i in {1..30}; do
            STATUS=$(curl -sf $PROD_URL/health | jq -r .status)
            if [ "$STATUS" = "healthy" ]; then
              echo "Health check passed!"
              exit 0
            fi
            echo "Waiting... ($i/30)"
            sleep 10
          done
          echo "Health check failed!"
          exit 1
        env:
          PROD_URL: ${{ vars.PROD_URL }}

      # Runs whenever any previous step failed (failure() is valid here:
      # it refers to this job's own earlier steps).
      - name: Automatic Rollback
        if: failure()
        run: |
          kubectl rollout undo deployment/my-app
          kubectl rollout status deployment/my-app --timeout=300s
          # Notify team
          curl -X POST $SLACK_WEBHOOK \
            -H "Content-Type: application/json" \
            -d "{\"text\":\"Deployment failed, rolled back to previous version\"}"
        env:
          KUBECONFIG: ${{ secrets.KUBE_CONFIG }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

# helm install prometheus-stack
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm install monitoring prometheus-community/kube-prometheus-stack \
  -n monitoring \
  --create-namespace

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: my-app
  labels:
    app: my-app
spec:
  selector:
    matchLabels:
      app: my-app
  endpoints:
    - port: http
      path: /metrics
      interval: 30s
      scrapeTimeout: 10s

Ключевые метрики:
| Метрика | Описание | Порог |
|---|---|---|
| `http_requests_total` | Всего запросов | — |
| `http_request_duration_seconds` | Латентность запросов | p99 < 500ms |
| `http_requests_errors_total` | Ошибки | error rate < 1% |
| `kube_deployment_status_replicas_available` | Доступные реплики | = desired |
# alerting-rules.yaml
groups:
  - name: deployment
    rules:
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_errors_total{job="my-app"}[5m]))
          / sum(rate(http_requests_total{job="my-app"}[5m])) > 0.01
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: HighLatency
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket{job="my-app"}[5m]))
            by (le)) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "P99 latency is {{ $value | humanizeDuration }}"

      - alert: DeploymentReplicasMismatch
        expr: |
          kube_deployment_status_replicas_available{deployment="my-app"}
          != kube_deployment_spec_replicas{deployment="my-app"}
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Deployment replicas mismatch"
          # $value is the value of the alert expression's left-hand series
          # (available replicas); the original printed {{ $value }} for both
          # "Available" and "Desired", which always showed the same number.
          description: "Available replicas: {{ $value }} (does not match desired replica count)"

{
"dashboard": {
"title": "Deployment Monitoring",
"panels": [
{
"title": "Request Rate",
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"my-app\"}[5m]))"
}
]
},
{
"title": "Error Rate",
"targets": [
{
"expr": "sum(rate(http_requests_errors_total{job=\"my-app\"}[5m])) / sum(rate(http_requests_total{job=\"my-app\"}[5m]))"
}
]
},
{
"title": "P99 Latency",
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=\"my-app\"}[5m])) by (le))"
}
]
}
]
}
}# values.yaml
monitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s
  grafanaDashboard:
    enabled: true
  alerts:
    enabled: true
    errorRateThreshold: 0.01
    latencyThreshold: 500ms

name: Canary with Monitoring Rollback
on:
  push:
    branches: [main]

jobs:
  canary-deploy:
    runs-on: ubuntu-latest
    environment: production
    steps:
      - uses: actions/checkout@v4

      - name: Deploy canary (10%)
        run: |
          kubectl apply -f k8s/canary-deployment.yaml
          kubectl set image deployment/my-app-canary app=myregistry/app:${{ github.sha }}
          kubectl rollout status deployment/my-app-canary --timeout=300s
          # Set 10% traffic
          kubectl patch virtualservice my-app-vs \
            --type='json' \
            -p='[{"op":"replace","path":"/spec/http/0/route/1/weight","value":10},
                 {"op":"replace","path":"/spec/http/0/route/0/weight","value":90}]'
        env:
          KUBECONFIG: ${{ secrets.KUBE_CONFIG }}

      - name: Monitor canary metrics
        id: monitor
        run: |
          echo "Monitoring canary for 5 minutes..."
          for i in {1..10}; do
            # Get error rate from Prometheus
            ERROR_RATE=$(curl -s http://prometheus:9090/api/v1/query \
              -d "query=sum(rate(http_requests_errors_total{version='canary'}[5m]))/sum(rate(http_requests_total{version='canary'}[5m]))" \
              | jq -r '.data.result[0].value[1] // "0"')
            # Get P99 latency
            LATENCY=$(curl -s http://prometheus:9090/api/v1/query \
              -d "query=histogram_quantile(0.99,sum(rate(http_request_duration_seconds_bucket{version='canary'}[5m])) by (le))" \
              | jq -r '.data.result[0].value[1] // "0"')
            echo "Iteration $i: Error Rate=$ERROR_RATE, P99 Latency=${LATENCY}s"
            # Check thresholds
            if (( $(echo "$ERROR_RATE > 0.01" | bc -l) )); then
              echo "Error rate threshold exceeded: $ERROR_RATE"
              echo "rollback=true" >> $GITHUB_OUTPUT
              exit 1
            fi
            if (( $(echo "$LATENCY > 0.5" | bc -l) )); then
              echo "Latency threshold exceeded: $LATENCY"
              echo "rollback=true" >> $GITHUB_OUTPUT
              exit 1
            fi
            sleep 30
          done
          echo "Canary metrics OK"
          echo "rollback=false" >> $GITHUB_OUTPUT

      - name: Increase to 50%
        if: steps.monitor.outputs.rollback == 'false'
        run: |
          kubectl patch virtualservice my-app-vs \
            --type='json' \
            -p='[{"op":"replace","path":"/spec/http/0/route/1/weight","value":50},
                 {"op":"replace","path":"/spec/http/0/route/0/weight","value":50}]'
        env:
          # NOTE(review): added — the original omitted KUBECONFIG on this step
          # (and on "Full rollout"), so kubectl had no cluster credentials.
          KUBECONFIG: ${{ secrets.KUBE_CONFIG }}

      - name: Full rollout
        if: steps.monitor.outputs.rollback == 'false'
        run: |
          kubectl patch virtualservice my-app-vs \
            --type='json' \
            -p='[{"op":"replace","path":"/spec/http/0/route/1/weight","value":100},
                 {"op":"replace","path":"/spec/http/0/route/0/weight","value":0}]'
        env:
          KUBECONFIG: ${{ secrets.KUBE_CONFIG }}

      - name: Rollback on failure
        if: failure()
        run: |
          echo "Rolling back canary deployment..."
          # Reset traffic to stable
          kubectl patch virtualservice my-app-vs \
            --type='json' \
            -p='[{"op":"replace","path":"/spec/http/0/route/1/weight","value":0},
                 {"op":"replace","path":"/spec/http/0/route/0/weight","value":100}]'
          # Delete canary deployment
          kubectl delete deployment my-app-canary || true
          # Notify team
          curl -X POST $SLACK_WEBHOOK \
            -H "Content-Type: application/json" \
            -d "{\"text\":\"Canary deployment failed and rolled back!\"}"
        env:
          KUBECONFIG: ${{ secrets.KUBE_CONFIG }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

name: Helm Deploy with Rollback
on:
  push:
    branches: [main]

jobs:
  deploy:
    runs-on: ubuntu-latest
    environment: production
    steps:
      - uses: actions/checkout@v4

      - name: Set up Helm
        uses: azure/setup-helm@v3

      # --atomic already rolls back automatically if the upgrade itself fails;
      # the explicit rollback step below covers failures in post-deploy checks.
      - name: Helm upgrade
        id: upgrade
        run: |
          helm upgrade my-app ./chart \
            --namespace production \
            --set image.tag=${{ github.sha }} \
            --atomic \
            --timeout 10m \
            --wait
        env:
          KUBECONFIG: ${{ secrets.KUBE_CONFIG }}

      - name: Post-deployment verification
        run: |
          # Check pods are ready
          READY=$(kubectl get pods -l app=my-app -n production \
            -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
            | tr ' ' '\n' | grep -c True)
          DESIRED=$(kubectl get deployment my-app -n production -o jsonpath='{.spec.replicas}')
          if [ "$READY" -lt "$DESIRED" ]; then
            echo "Not all pods are ready!"
            exit 1
          fi
          # Health check
          curl -sf $PROD_URL/health || exit 1
        env:
          KUBECONFIG: ${{ secrets.KUBE_CONFIG }}
          PROD_URL: ${{ vars.PROD_URL }}

      - name: Rollback on failure
        if: failure()
        run: |
          helm rollback my-app -n production
          echo "Rolled back to previous Helm revision"
        env:
          KUBECONFIG: ${{ secrets.KUBE_CONFIG }}

name: Deploy Notifications
on:
  workflow_run:
    workflows: ["Deploy"]
    types: [completed]

jobs:
  notify:
    runs-on: ubuntu-latest
    steps:
      - name: Notify Slack
        run: |
          STATUS="${{ github.event.workflow_run.conclusion }}"
          WORKFLOW="${{ github.event.workflow_run.name }}"
          URL="${{ github.event.workflow_run.html_url }}"
          if [ "$STATUS" = "success" ]; then
            COLOR="good"
            EMOJI="✅"
          else
            COLOR="danger"
            EMOJI="❌"
          fi
          curl -X POST $SLACK_WEBHOOK \
            -H "Content-Type: application/json" \
            -d "{
              \"attachments\": [{
                \"color\": \"$COLOR\",
                \"title\": \"$EMOJI Deploy $STATUS\",
                \"text\": \"Workflow: $WORKFLOW\",
                \"actions\": [{
                  \"type\": \"button\",
                  \"text\": \"View Details\",
                  \"url\": \"$URL\"
                }]
              }]
            }"
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

      - name: Notify PagerDuty on failure
        # Fixed: the original used `if: failure()`, which tests whether THIS
        # notify job failed — in a workflow_run listener it would never fire
        # for a failed deploy. Check the monitored run's conclusion instead.
        if: github.event.workflow_run.conclusion == 'failure'
        run: |
          curl -X POST https://events.pagerduty.com/v2/enqueue \
            -H "Content-Type: application/json" \
            -d "{
              \"routing_key\": \"$PAGERDUTY_ROUTING_KEY\",
              \"event_action\": \"trigger\",
              \"payload\": {
                \"summary\": \"Deployment failed in production\",
                \"severity\": \"critical\",
                \"source\": \"github-actions\",
                \"custom_details\": {
                  \"repository\": \"$GITHUB_REPOSITORY\",
                  \"commit\": \"$GITHUB_SHA\",
                  \"workflow\": \"$GITHUB_WORKFLOW\"
                }
              }
            }"
        env:
          PAGERDUTY_ROUTING_KEY: ${{ secrets.PAGERDUTY_ROUTING_KEY }}

Вопросы ещё не добавлены
Вопросы для этой подтемы ещё не добавлены.