Saltar a contenido

SRE / Platform Engineer - Guía de Entorno

Guía para configurar el entorno de Site Reliability Engineering con Claude Code.

Resumen de Capacidades

Capacidad Herramientas
Observabilidad Prometheus, Grafana, Jaeger
Logging ELK Stack, Loki, Fluentd
Tracing OpenTelemetry, Jaeger, Zipkin
Alerting Alertmanager, PagerDuty
Incident Management Blameless, incident.io
Chaos Engineering Chaos Monkey, Litmus

Observabilidad

Prometheus

# Docker
docker run -d --name prometheus \
  -p 9090:9090 \
  -v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml \
  prom/prometheus

# Verificar
curl http://localhost:9090/-/healthy

prometheus.yml:

global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

rule_files:
  - "alerts/*.yml"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod

Grafana

# Docker
docker run -d --name grafana \
  -p 3000:3000 \
  -v grafana-data:/var/lib/grafana \
  grafana/grafana

# Login: admin/admin

# CLI
brew install grafana  # macOS
grafana-cli plugins install grafana-piechart-panel

Loki (Logs)

# Docker Compose
# docker-compose.yml
services:
  loki:
    image: grafana/loki:2.9.0
    ports:
      - "3100:3100"
    command: -config.file=/etc/loki/local-config.yaml

  promtail:
    image: grafana/promtail:2.9.0
    volumes:
      - /var/log:/var/log
    command: -config.file=/etc/promtail/config.yml

# LogQL queries
{job="varlogs"} |= "error"
{namespace="production"} | json | level="error"

OpenTelemetry

Collector

# Docker
docker run -d --name otel-collector \
  -p 4317:4317 \
  -p 4318:4318 \
  -v $(pwd)/otel-config.yaml:/etc/otel/config.yaml \
  otel/opentelemetry-collector-contrib:latest \
  --config=/etc/otel/config.yaml

otel-config.yaml:

receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  batch:
    timeout: 1s
    send_batch_size: 1024

exporters:
  prometheus:
    endpoint: "0.0.0.0:8889"

  # El exporter 'jaeger' fue eliminado del Collector (contrib v0.86+);
  # Jaeger moderno recibe trazas de forma nativa por OTLP.
  otlp/jaeger:
    endpoint: jaeger:4317
    tls:
      insecure: true

  # NOTA: el exporter 'loki' está deprecado en versiones recientes del
  # Collector; con Loki >= 3.0 puede usarse 'otlphttp' hacia /otlp.
  loki:
    endpoint: http://loki:3100/loki/api/v1/push

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [otlp/jaeger]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [loki]

Jaeger (Tracing)

# All-in-one para desarrollo
docker run -d --name jaeger \
  -p 16686:16686 \
  -p 4317:4317 \
  -p 4318:4318 \
  jaegertracing/all-in-one:latest

# UI: http://localhost:16686

Alerting

Alertmanager

docker run -d --name alertmanager \
  -p 9093:9093 \
  -v $(pwd)/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager

alertmanager.yml:

global:
  slack_api_url: 'https://hooks.slack.com/services/xxx'

route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'slack-notifications'

  routes:
    # 'match' está deprecado desde Alertmanager 0.22; usar 'matchers'
    - matchers:
        - severity = "critical"
      receiver: 'pagerduty-critical'

receivers:
  - name: 'slack-notifications'
    slack_configs:
      - channel: '#alerts'
        title: '{{ .CommonAnnotations.summary }}'
        text: '{{ .CommonAnnotations.description }}'

  - name: 'pagerduty-critical'
    pagerduty_configs:
      - service_key: 'xxx'

Alert Rules

# alerts/app.yml
groups:
  - name: app-alerts
    rules:
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: HighLatency
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"

SLI/SLO

SLI Definitions

# Availability SLI
- record: sli:availability:ratio
  expr: |
    sum(rate(http_requests_total{status!~"5.."}[5m]))
    /
    sum(rate(http_requests_total[5m]))

# Latency SLI (P99 < 200ms)
- record: sli:latency:ratio
  expr: |
    sum(rate(http_request_duration_seconds_bucket{le="0.2"}[5m]))
    /
    sum(rate(http_request_duration_seconds_count[5m]))

Error Budget

# Error budget remaining
- record: error_budget:remaining
  expr: |
    1 - (
      (1 - sli:availability:ratio)
      /
      (1 - 0.999)  # SLO: 99.9%
    )

Chaos Engineering

Litmus Chaos

# Instalar en Kubernetes
kubectl apply -f https://litmuschaos.github.io/litmus/litmus-operator-v3.0.0.yaml

# Crear experimento
kubectl apply -f - <<EOF
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: pod-kill-chaos
spec:
  appinfo:
    appns: default
    applabel: "app=myapp"
  chaosServiceAccount: litmus-admin
  experiments:
    - name: pod-delete
      spec:
        components:
          env:
            - name: TOTAL_CHAOS_DURATION
              value: '30'
            - name: CHAOS_INTERVAL
              value: '10'
EOF

Chaos Toolkit

pip install chaostoolkit chaostoolkit-kubernetes

# Ejecutar experimento
chaos run experiment.json

Runbooks & Automation

Runbook Template

# Runbook: High CPU Usage

## Symptoms
- Alert: HighCPUUsage
- CPU > 80% for > 5 minutes

## Investigation
1. Check current CPU usage:
   ```
   kubectl top pods -n production
   ```

2. Check recent deployments:
   ```
   kubectl rollout history deployment/app
   ```

3. Check logs:
   ```
   kubectl logs -l app=myapp --tail=100
   ```

## Remediation
1. Scale horizontally:
   ```
   kubectl scale deployment/app --replicas=5
   ```

2. If OOM, increase resources:
   ```
   kubectl set resources deployment/app -c=app --limits=cpu=2000m,memory=2Gi
   ```

## Escalation
- If not resolved in 15 minutes, page on-call engineer
- Slack: #incident-response

CLI Tools

k9s

brew install k9s
k9s  # TUI para Kubernetes

stern (Log tailing)

brew install stern
stern -n production app  # Logs de todos los pods con label app
stern -n production ".*" --since 1h

kubectx/kubens

brew install kubectx

kubectx          # Cambiar contexto
kubens           # Cambiar namespace
kubens production

Comandos que Claude Code Ejecutará

# Prometheus
curl 'localhost:9090/api/v1/query?query=up'
promtool check config prometheus.yml

# Logs
stern -n production app
kubectl logs -f deployment/app

# Metrics
kubectl top nodes
kubectl top pods

# Chaos
chaos run experiment.json
kubectl apply -f chaos-experiment.yaml

# Alertmanager
amtool check-config alertmanager.yml
amtool alert query

Stack Local (Docker Compose)

# docker-compose.observability.yml
# (la clave 'version' es obsoleta en Compose v2 y puede omitirse)
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin

  loki:
    image: grafana/loki:latest
    ports:
      - "3100:3100"

  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      - "16686:16686"
      - "4317:4317"

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
docker compose -f docker-compose.observability.yml up -d

VS Code Extensions

code --install-extension ms-kubernetes-tools.vscode-kubernetes-tools
code --install-extension Tim-Koehler.helm-intellisense
code --install-extension hashicorp.terraform
code --install-extension redhat.vscode-yaml

Verificación del Entorno

#!/bin/bash
echo "=== Verificación Entorno SRE ==="

echo -e "\n--- Kubernetes ---"
kubectl version --client 2>/dev/null | head -1 || echo "kubectl no instalado"
k9s version 2>/dev/null | head -1 || echo "k9s no instalado"
stern --version 2>/dev/null || echo "stern no instalado"

echo -e "\n--- Observability ---"
docker ps --format "{{.Names}}" | grep -E "prometheus|grafana|loki|jaeger" || echo "No hay servicios de observabilidad corriendo"

echo -e "\n--- Chaos ---"
chaos --version 2>/dev/null || echo "Chaos Toolkit no instalado"

echo -e "\n=== Verificación Completa ==="

Recursos