Multi-Region Kubernetes Deployment

Deploy production-ready Kubernetes clusters across multiple regions with high availability, disaster recovery, and enterprise-grade security hardening.

Expert Level | 2-3 Hours | Enterprise Security | Multi-Region
RFS - Senior Penetration Tester | eJPT, eCPPTv2, CRTP, ADCS CESP

Multi-Region Architecture Overview

  • Primary Region (US-East): Main Kubernetes cluster with control plane and worker nodes
  • Secondary Region (US-West): Disaster recovery cluster with data replication
  • Edge Region (EU-West): Edge cluster for low-latency European traffic

Prerequisites

Required Tools

  • Terraform ≥ 1.5.0
  • kubectl ≥ 1.28.0
  • Helm ≥ 3.12.0
  • AWS CLI v2 (this guide targets AWS EKS; a GCP CLI would only be needed for a GKE equivalent)

Cloud Permissions

  • EKS/GKE cluster management
  • VPC and networking
  • IAM role management
  • Load balancer configuration
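
Before starting, a quick sanity check of the toolchain and AWS credentials can save a failed apply later. This is a minimal sketch and assumes the AWS-only setup used throughout this guide:

# Verify required tool versions
terraform version
kubectl version --client
helm version --short
aws --version

# Confirm the AWS identity Terraform will use
aws sts get-caller-identity
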
Step 1: Multi-Region Infrastructure Setup

Create the foundational infrastructure across multiple regions using Terraform.

Main Terraform Configuration

# main.tf
terraform {
  required_version = ">= 1.5.0"
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.23"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.11"
    }
  }
}

# Provider configurations for multiple regions
provider "aws" {
  alias  = "us_east_1"
  region = "us-east-1"
}

provider "aws" {
  alias  = "us_west_2"
  region = "us-west-2"
}

provider "aws" {
  alias  = "eu_west_1"
  region = "eu-west-1"
}

# Primary region EKS cluster
module "primary_cluster" {
  source = "./modules/eks-cluster"
  
  providers = {
    aws = aws.us_east_1
  }
  
  cluster_name    = "primary-k8s-cluster"
  cluster_version = "1.28"
  region          = "us-east-1"
  
  vpc_cidr = "10.0.0.0/16"
  
  node_groups = {
    system = {
      instance_types = ["t3.medium"]
      min_size      = 2
      max_size      = 4
      desired_size  = 2
      
      labels = {
        role = "system"
      }
      
      taints = [{
        key    = "CriticalAddonsOnly"
        value  = "true"
        effect = "NO_SCHEDULE"
      }]
    }
    
    application = {
      instance_types = ["t3.large"]
      min_size      = 3
      max_size      = 10
      desired_size  = 3
      
      labels = {
        role = "application"
      }
    }
  }
  
  tags = {
    Environment = "production"
    Region      = "primary"
    Terraform   = "true"
  }
}

# Secondary region EKS cluster (DR)
module "secondary_cluster" {
  source = "./modules/eks-cluster"
  
  providers = {
    aws = aws.us_west_2
  }
  
  cluster_name    = "secondary-k8s-cluster"
  cluster_version = "1.28"
  region          = "us-west-2"
  
  vpc_cidr = "10.1.0.0/16"
  
  node_groups = {
    system = {
      instance_types = ["t3.medium"]
      min_size      = 1
      max_size      = 3
      desired_size  = 1
      
      labels = {
        role = "system"
      }
    }
    
    application = {
      instance_types = ["t3.large"]
      min_size      = 2
      max_size      = 8
      desired_size  = 2
      
      labels = {
        role = "application"
      }
    }
  }
  
  tags = {
    Environment = "production"
    Region      = "secondary"
    Terraform   = "true"
  }
}

# Edge region EKS cluster
module "edge_cluster" {
  source = "./modules/eks-cluster"
  
  providers = {
    aws = aws.eu_west_1
  }
  
  cluster_name    = "edge-k8s-cluster"
  cluster_version = "1.28"
  region          = "eu-west-1"
  
  vpc_cidr = "10.2.0.0/16"
  
  node_groups = {
    edge = {
      instance_types = ["t3.medium"]
      min_size      = 2
      max_size      = 6
      desired_size  = 2
      
      labels = {
        role = "edge"
      }
    }
  }
  
  tags = {
    Environment = "production"
    Region      = "edge"
    Terraform   = "true"
  }
}

EKS Cluster Module

# modules/eks-cluster/main.tf
data "aws_availability_zones" "available" {
  state = "available"
}

# VPC Configuration
resource "aws_vpc" "main" {
  cidr_block           = var.vpc_cidr
  enable_dns_hostnames = true
  enable_dns_support   = true
  
  tags = merge(var.tags, {
    Name = "${var.cluster_name}-vpc"
    "kubernetes.io/cluster/${var.cluster_name}" = "shared"
  })
}

# Internet Gateway
resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id
  
  tags = merge(var.tags, {
    Name = "${var.cluster_name}-igw"
  })
}

# Public Subnets
resource "aws_subnet" "public" {
  count = 3
  
  vpc_id                  = aws_vpc.main.id
  cidr_block              = cidrsubnet(var.vpc_cidr, 8, count.index)
  availability_zone       = data.aws_availability_zones.available.names[count.index]
  map_public_ip_on_launch = true
  
  tags = merge(var.tags, {
    Name = "${var.cluster_name}-public-${count.index + 1}"
    "kubernetes.io/cluster/${var.cluster_name}" = "shared"
    "kubernetes.io/role/elb" = "1"
  })
}

# Private Subnets
resource "aws_subnet" "private" {
  count = 3
  
  vpc_id            = aws_vpc.main.id
  cidr_block        = cidrsubnet(var.vpc_cidr, 8, count.index + 10)
  availability_zone = data.aws_availability_zones.available.names[count.index]
  
  tags = merge(var.tags, {
    Name = "${var.cluster_name}-private-${count.index + 1}"
    "kubernetes.io/cluster/${var.cluster_name}" = "owned"
    "kubernetes.io/role/internal-elb" = "1"
  })
}

# NAT Gateways
resource "aws_eip" "nat" {
  count = 3
  domain = "vpc"
  
  tags = merge(var.tags, {
    Name = "${var.cluster_name}-nat-eip-${count.index + 1}"
  })
}

resource "aws_nat_gateway" "main" {
  count = 3
  
  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id
  
  tags = merge(var.tags, {
    Name = "${var.cluster_name}-nat-${count.index + 1}"
  })
  
  depends_on = [aws_internet_gateway.main]
}

# Route Tables
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id
  
  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.main.id
  }
  
  tags = merge(var.tags, {
    Name = "${var.cluster_name}-public-rt"
  })
}

resource "aws_route_table" "private" {
  count = 3
  
  vpc_id = aws_vpc.main.id
  
  route {
    cidr_block     = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.main[count.index].id
  }
  
  tags = merge(var.tags, {
    Name = "${var.cluster_name}-private-rt-${count.index + 1}"
  })
}

# Route Table Associations
resource "aws_route_table_association" "public" {
  count = 3
  
  subnet_id      = aws_subnet.public[count.index].id
  route_table_id = aws_route_table.public.id
}

resource "aws_route_table_association" "private" {
  count = 3
  
  subnet_id      = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private[count.index].id
}

# EKS Cluster IAM Role
resource "aws_iam_role" "cluster" {
  name = "${var.cluster_name}-cluster-role"
  
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "eks.amazonaws.com"
        }
      }
    ]
  })
  
  tags = var.tags
}

resource "aws_iam_role_policy_attachment" "cluster_AmazonEKSClusterPolicy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
  role       = aws_iam_role.cluster.name
}

# EKS Cluster Security Group
resource "aws_security_group" "cluster" {
  name_prefix = "${var.cluster_name}-cluster-sg"
  vpc_id      = aws_vpc.main.id
  
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
  
  tags = merge(var.tags, {
    Name = "${var.cluster_name}-cluster-sg"
  })
}

# EKS Cluster
resource "aws_eks_cluster" "main" {
  name     = var.cluster_name
  role_arn = aws_iam_role.cluster.arn
  version  = var.cluster_version
  
  vpc_config {
    subnet_ids              = concat(aws_subnet.private[*].id, aws_subnet.public[*].id)
    endpoint_private_access = true
    endpoint_public_access  = true
    public_access_cidrs     = ["0.0.0.0/0"]
    security_group_ids      = [aws_security_group.cluster.id]
  }
  
  encryption_config {
    provider {
      key_arn = aws_kms_key.eks.arn
    }
    resources = ["secrets"]
  }
  
  enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
  
  depends_on = [
    aws_iam_role_policy_attachment.cluster_AmazonEKSClusterPolicy,
    aws_cloudwatch_log_group.cluster
  ]
  
  tags = var.tags
}

# CloudWatch Log Group
resource "aws_cloudwatch_log_group" "cluster" {
  name              = "/aws/eks/${var.cluster_name}/cluster"
  retention_in_days = 7
  
  tags = var.tags
}

# KMS Key for EKS encryption
resource "aws_kms_key" "eks" {
  description             = "EKS Secret Encryption Key for ${var.cluster_name}"
  deletion_window_in_days = 7
  
  tags = var.tags
}

resource "aws_kms_alias" "eks" {
  name          = "alias/eks-${var.cluster_name}"
  target_key_id = aws_kms_key.eks.key_id
}
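
Before moving on to cross-region networking, it helps to validate the configuration and, after an apply, confirm each cluster reports ACTIVE. Note that Step 2 consumes module outputs such as vpc_id, private_route_table_ids, and cluster_security_group_id, which the module must expose in an outputs.tf (not shown above). A minimal sketch using the cluster names and regions defined above:

# Validate the configuration and module wiring
terraform fmt -recursive
terraform validate

# After `terraform apply`, confirm each cluster is up
aws eks describe-cluster --region us-east-1 --name primary-k8s-cluster --query 'cluster.status'
aws eks describe-cluster --region us-west-2 --name secondary-k8s-cluster --query 'cluster.status'
aws eks describe-cluster --region eu-west-1 --name edge-k8s-cluster --query 'cluster.status'
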
Step 2: Cross-Region Networking Setup

Configure secure networking between regions with VPC peering; at larger scale, a transit gateway can replace the peering mesh.

VPC Peering Configuration

# networking.tf
# VPC Peering between Primary and Secondary regions
resource "aws_vpc_peering_connection" "primary_secondary" {
  provider = aws.us_east_1
  
  vpc_id        = module.primary_cluster.vpc_id
  peer_vpc_id   = module.secondary_cluster.vpc_id
  peer_region   = "us-west-2"
  auto_accept   = false
  
  tags = {
    Name = "primary-secondary-peering"
  }
}

resource "aws_vpc_peering_connection_accepter" "primary_secondary" {
  provider = aws.us_west_2
  
  vpc_peering_connection_id = aws_vpc_peering_connection.primary_secondary.id
  auto_accept               = true
  
  tags = {
    Name = "primary-secondary-peering-accepter"
  }
}

# VPC Peering between Primary and Edge regions
resource "aws_vpc_peering_connection" "primary_edge" {
  provider = aws.us_east_1
  
  vpc_id        = module.primary_cluster.vpc_id
  peer_vpc_id   = module.edge_cluster.vpc_id
  peer_region   = "eu-west-1"
  auto_accept   = false
  
  tags = {
    Name = "primary-edge-peering"
  }
}

resource "aws_vpc_peering_connection_accepter" "primary_edge" {
  provider = aws.eu_west_1
  
  vpc_peering_connection_id = aws_vpc_peering_connection.primary_edge.id
  auto_accept               = true
  
  tags = {
    Name = "primary-edge-peering-accepter"
  }
}

# Route table updates for peering connections
resource "aws_route" "primary_to_secondary" {
  provider = aws.us_east_1
  
  count                     = length(module.primary_cluster.private_route_table_ids)
  route_table_id            = module.primary_cluster.private_route_table_ids[count.index]
  destination_cidr_block    = "10.1.0.0/16"
  vpc_peering_connection_id = aws_vpc_peering_connection.primary_secondary.id
}

resource "aws_route" "secondary_to_primary" {
  provider = aws.us_west_2
  
  count                     = length(module.secondary_cluster.private_route_table_ids)
  route_table_id            = module.secondary_cluster.private_route_table_ids[count.index]
  destination_cidr_block    = "10.0.0.0/16"
  vpc_peering_connection_id = aws_vpc_peering_connection.primary_secondary.id
}

resource "aws_route" "primary_to_edge" {
  provider = aws.us_east_1
  
  count                     = length(module.primary_cluster.private_route_table_ids)
  route_table_id            = module.primary_cluster.private_route_table_ids[count.index]
  destination_cidr_block    = "10.2.0.0/16"
  vpc_peering_connection_id = aws_vpc_peering_connection.primary_edge.id
}

resource "aws_route" "edge_to_primary" {
  provider = aws.eu_west_1
  
  count                     = length(module.edge_cluster.private_route_table_ids)
  route_table_id            = module.edge_cluster.private_route_table_ids[count.index]
  destination_cidr_block    = "10.0.0.0/16"
  vpc_peering_connection_id = aws_vpc_peering_connection.primary_edge.id
}

# Security group rules for cross-region communication
resource "aws_security_group_rule" "primary_allow_secondary" {
  provider = aws.us_east_1
  
  type              = "ingress"
  from_port         = 0
  to_port           = 65535
  protocol          = "tcp"
  cidr_blocks       = ["10.1.0.0/16"]
  security_group_id = module.primary_cluster.cluster_security_group_id
}

resource "aws_security_group_rule" "secondary_allow_primary" {
  provider = aws.us_west_2
  
  type              = "ingress"
  from_port         = 0
  to_port           = 65535
  protocol          = "tcp"
  cidr_blocks       = ["10.0.0.0/16"]
  security_group_id = module.secondary_cluster.cluster_security_group_id
}

resource "aws_security_group_rule" "primary_allow_edge" {
  provider = aws.us_east_1
  
  type              = "ingress"
  from_port         = 0
  to_port           = 65535
  protocol          = "tcp"
  cidr_blocks       = ["10.2.0.0/16"]
  security_group_id = module.primary_cluster.cluster_security_group_id
}

resource "aws_security_group_rule" "edge_allow_primary" {
  provider = aws.eu_west_1
  
  type              = "ingress"
  from_port         = 0
  to_port           = 65535
  protocol          = "tcp"
  cidr_blocks       = ["10.0.0.0/16"]
  security_group_id = module.edge_cluster.cluster_security_group_id
}
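
Once the peering connections and routes are applied, basic connectivity can be checked from the CLI. A short sketch (the route-table tag filter follows the naming convention from the EKS module above):

# Confirm both peering connections are active
aws ec2 describe-vpc-peering-connections --region us-east-1 \
  --query 'VpcPeeringConnections[].{Id:VpcPeeringConnectionId,Status:Status.Code}'

# Confirm the peering route to the secondary VPC landed in the primary private route tables
aws ec2 describe-route-tables --region us-east-1 \
  --filters "Name=tag:Name,Values=primary-k8s-cluster-private-rt-*" \
  --query 'RouteTables[].Routes[?DestinationCidrBlock==`10.1.0.0/16`]'
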
Step 3: Service Mesh Configuration

Deploy Istio service mesh for cross-region service communication and security.

Istio Multi-Cluster Setup

# istio-setup.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: istio-system
  labels:
    istio-injection: disabled
---
apiVersion: v1
kind: Namespace
metadata:
  name: istio-gateway
  labels:
    istio-injection: enabled
---
# Istio Control Plane for Primary Cluster
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
metadata:
  name: primary-cluster
  namespace: istio-system
spec:
  values:
    global:
      meshID: mesh1
      network: primary-network
      externalIstiod: false
    pilot:
      env:
        EXTERNAL_ISTIOD: false
        ENABLE_CROSS_CLUSTER_WORKLOAD_ENTRY: true
  components:
    pilot:
      k8s:
        env:
          - name: PILOT_ENABLE_WORKLOAD_ENTRY_AUTOREGISTRATION
            value: "true"
          - name: PILOT_ENABLE_CROSS_CLUSTER_WORKLOAD_ENTRY
            value: "true"
---
# Gateway for cross-cluster communication
apiVersion: networking.istio.io/v1beta1
kind: Gateway
metadata:
  name: cross-network-gateway
  namespace: istio-system
spec:
  selector:
    istio: eastwestgateway
  servers:
    - port:
        number: 15021
        name: status-port
        protocol: TLS
      tls:
        mode: ISTIO_MUTUAL
      hosts:
        - cross-network-gateway.istio-system.svc.cluster.local
---
# East-West Gateway Service
apiVersion: v1
kind: Service
metadata:
  name: istio-eastwestgateway
  namespace: istio-system
  labels:
    istio: eastwestgateway
    app: istio-eastwestgateway
spec:
  type: LoadBalancer
  selector:
    istio: eastwestgateway
  ports:
    - port: 15021
      targetPort: 15021
      name: status-port
    - port: 15010
      targetPort: 15010
      name: tls
    - port: 15011
      targetPort: 15011
      name: tls-istiod
    - port: 15012
      targetPort: 15012
      name: tls-istiodwebhook
---
# Deployment for East-West Gateway
apiVersion: apps/v1
kind: Deployment
metadata:
  name: istio-eastwestgateway
  namespace: istio-system
spec:
  selector:
    matchLabels:
      istio: eastwestgateway
      app: istio-eastwestgateway
  template:
    metadata:
      annotations:
        inject.istio.io/templates: gateway
      labels:
        istio: eastwestgateway
        app: istio-eastwestgateway
    spec:
      containers:
        - name: istio-proxy
          image: auto
          resources:
            limits:
              cpu: 2000m
              memory: 1024Mi
            requests:
              cpu: 100m
              memory: 128Mi
      serviceAccountName: istio-eastwestgateway-service-account
---
# Service Account for East-West Gateway
apiVersion: v1
kind: ServiceAccount
metadata:
  name: istio-eastwestgateway-service-account
  namespace: istio-system
  labels:
    istio: eastwestgateway
---
# Role for East-West Gateway
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: istio-eastwestgateway-sds
  namespace: istio-system
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["get", "watch", "list"]
---
# RoleBinding for East-West Gateway
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: istio-eastwestgateway-sds
  namespace: istio-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: istio-eastwestgateway-sds
subjects:
  - kind: ServiceAccount
    name: istio-eastwestgateway-service-account
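
After these manifests are applied (and the Istio control plane itself is installed, see the deployment commands later), it is worth confirming that the east-west gateway received a load balancer address and that the proxies are in sync. A minimal sketch, assuming a kubectl context alias of primary:

# Check that the east-west gateway Service got an external address
kubectl --context=primary -n istio-system get svc istio-eastwestgateway

# Verify the control plane and injected proxies are in sync
istioctl --context=primary proxy-status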

Cross-Cluster Service Discovery

# cross-cluster-discovery.yaml
# Secret for remote cluster access
apiVersion: v1
kind: Secret
metadata:
  name: istio-remote-secret-secondary
  namespace: istio-system
  labels:
    istio/cluster: secondary-cluster
type: Opaque
stringData:
  secondary-cluster: |
    apiVersion: v1
    kind: Config
    clusters:
    - cluster:
        certificate-authority-data: LS0tLS1CRUdJTi...
        server: https://secondary-cluster-endpoint
      name: secondary-cluster
    contexts:
    - context:
        cluster: secondary-cluster
        user: secondary-cluster
      name: secondary-cluster
    current-context: secondary-cluster
    users:
    - name: secondary-cluster
      user:
        token: eyJhbGciOiJSUzI1NiIsImtpZCI6...
---
# Secret for edge cluster access
apiVersion: v1
kind: Secret
metadata:
  name: istio-remote-secret-edge
  namespace: istio-system
  labels:
    istio/cluster: edge-cluster
type: Opaque
stringData:
  edge-cluster: |
    apiVersion: v1
    kind: Config
    clusters:
    - cluster:
        certificate-authority-data: LS0tLS1CRUdJTi...
        server: https://edge-cluster-endpoint
      name: edge-cluster
    contexts:
    - context:
        cluster: edge-cluster
        user: edge-cluster
      name: edge-cluster
    current-context: edge-cluster
    users:
    - name: edge-cluster
      user:
        token: eyJhbGciOiJSUzI1NiIsImtpZCI6...
---
# Network configuration for multi-cluster
apiVersion: v1
kind: ConfigMap
metadata:
  name: istio-multi-network
  namespace: istio-system
data:
  networks: |
    networks:
      primary-network:
        endpoints:
        - fromRegistry: primary-cluster
        gateways:
        - registryServiceName: istio-eastwestgateway.istio-system.svc.cluster.local
          port: 15021
      secondary-network:
        endpoints:
        - fromRegistry: secondary-cluster
        gateways:
        - registryServiceName: istio-eastwestgateway.istio-system.svc.cluster.local
          port: 15021
      edge-network:
        endpoints:
        - fromRegistry: edge-cluster
        gateways:
        - registryServiceName: istio-eastwestgateway.istio-system.svc.cluster.local
          port: 15021
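
Hand-writing the remote secrets above (with real CA data and service account tokens) is error-prone; in practice they are usually generated with istioctl and applied to the primary cluster. A sketch, assuming the kubectl context aliases created in the deployment commands section:

# Generate a remote secret for each peer cluster and register it with the primary control plane
istioctl create-remote-secret --context=secondary --name=secondary-cluster | \
  kubectl --context=primary apply -f -

istioctl create-remote-secret --context=edge --name=edge-cluster | \
  kubectl --context=primary apply -f -
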
Step 4: Multi-Region Application Deployment

Deploy applications across regions with proper load balancing and failover.

Application Deployment Manifest

# multi-region-app.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: production
  labels:
    istio-injection: enabled
---
# ConfigMap for application configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: app-config
  namespace: production
data:
  region: "primary"
  database_url: "postgresql://primary-db:5432/app"
  redis_url: "redis://primary-redis:6379"
  log_level: "info"
---
# Secret for database credentials
apiVersion: v1
kind: Secret
metadata:
  name: app-secrets
  namespace: production
type: Opaque
data:
  db_password: cGFzc3dvcmQxMjM=
  redis_password: cmVkaXNwYXNzd29yZA==
  jwt_secret: and0c2VjcmV0a2V5MTIz
---
# Deployment for primary region
apiVersion: apps/v1
kind: Deployment
metadata:
  name: web-app-primary
  namespace: production
  labels:
    app: web-app
    version: v1
    region: primary
spec:
  replicas: 3
  selector:
    matchLabels:
      app: web-app
      version: v1
      region: primary
  template:
    metadata:
      labels:
        app: web-app
        version: v1
        region: primary
    spec:
      containers:
      - name: web-app
        image: nginx:1.21-alpine
        ports:
        - containerPort: 80
        env:
        - name: REGION
          valueFrom:
            configMapKeyRef:
              name: app-config
              key: region
        - name: DATABASE_URL
          valueFrom:
            configMapKeyRef:
              name: app-config
              key: database_url
        - name: DB_PASSWORD
          valueFrom:
            secretKeyRef:
              name: app-secrets
              key: db_password
        resources:
          requests:
            memory: "128Mi"
            cpu: "100m"
          limits:
            memory: "256Mi"
            cpu: "200m"
        livenessProbe:
          httpGet:
            path: /health
            port: 80
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /ready
            port: 80
          initialDelaySeconds: 5
          periodSeconds: 5
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - web-app
              topologyKey: kubernetes.io/hostname
---
# Service for the application
apiVersion: v1
kind: Service
metadata:
  name: web-app-service
  namespace: production
  labels:
    app: web-app
spec:
  selector:
    app: web-app
  ports:
  - port: 80
    targetPort: 80
    name: http
  type: ClusterIP
---
# Horizontal Pod Autoscaler
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: web-app-hpa
  namespace: production
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: web-app-primary
  minReplicas: 3
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80
---
# Virtual Service for traffic management
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: web-app-vs
  namespace: production
spec:
  hosts:
  - web-app-service
  http:
  - match:
    - headers:
        region:
          exact: primary
    route:
    - destination:
        host: web-app-service
        subset: primary
      weight: 100
  - route:
    - destination:
        host: web-app-service
        subset: primary
      weight: 80
    - destination:
        host: web-app-service.production.svc.cluster.local
        subset: secondary
      weight: 20
---
# Destination Rule for load balancing
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
  name: web-app-dr
  namespace: production
spec:
  host: web-app-service
  trafficPolicy:
    loadBalancer:
      simple: LEAST_CONN
    connectionPool:
      tcp:
        maxConnections: 100
      http:
        http1MaxPendingRequests: 50
        maxRequestsPerConnection: 10
    outlierDetection:
      consecutive5xxErrors: 3
      interval: 30s
      baseEjectionTime: 30s
      maxEjectionPercent: 50
  subsets:
  - name: primary
    labels:
      region: primary
  - name: secondary
    labels:
      region: secondary
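
After applying the manifest to each cluster, a quick verification pass confirms the rollout, the autoscaler, and the Istio traffic objects. A minimal sketch against the primary context:

# Wait for the primary deployment to become available
kubectl --context=primary -n production rollout status deployment/web-app-primary

# Confirm the HPA and the Istio routing resources were created
kubectl --context=primary -n production get hpa web-app-hpa
kubectl --context=primary -n production get virtualservice,destinationrule
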
Step 5: Monitoring and Observability

Set up monitoring across all regions with Prometheus and Grafana; distributed tracing (for example with Jaeger) can be layered on top of the same stack.

Prometheus Multi-Cluster Setup

# monitoring-setup.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# Prometheus Configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
      external_labels:
        cluster: 'primary-cluster'
        region: 'us-east-1'
    
    rule_files:
      - "/etc/prometheus/rules/*.yml"
    
    scrape_configs:
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
        - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
          action: keep
          regex: default;kubernetes;https
      
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
        - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.+)
        - target_label: __address__
          replacement: kubernetes.default.svc:443
        - source_labels: [__meta_kubernetes_node_name]
          regex: (.+)
          target_label: __metrics_path__
          replacement: /api/v1/nodes/${1}/proxy/metrics
      
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
        - role: pod
        relabel_configs:
        - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
          action: keep
          regex: true
        - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
          action: replace
          target_label: __metrics_path__
          regex: (.+)
        - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
          action: replace
          regex: ([^:]+)(?::\d+)?;(\d+)
          replacement: $1:$2
          target_label: __address__
        - action: labelmap
          regex: __meta_kubernetes_pod_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          action: replace
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_pod_name]
          action: replace
          target_label: kubernetes_pod_name
      
      - job_name: 'istio-mesh'
        kubernetes_sd_configs:
        - role: endpoints
          namespaces:
            names:
            - istio-system
        relabel_configs:
        - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
          action: keep
          regex: istio-telemetry;prometheus
      
      - job_name: 'federate-secondary'
        scrape_interval: 15s
        honor_labels: true
        metrics_path: '/federate'
        params:
          'match[]':
            - '{job=~"kubernetes.*"}'
            - '{job=~"istio.*"}'
        static_configs:
          - targets:
            - 'prometheus.monitoring.svc.cluster.local:9090'
        relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: secondary-prometheus-endpoint:9090
    
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          - alertmanager:9093
---
# Prometheus Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
      - name: prometheus
        image: prom/prometheus:v2.45.0
        args:
          - '--config.file=/etc/prometheus/prometheus.yml'
          - '--storage.tsdb.path=/prometheus/'
          - '--web.console.libraries=/etc/prometheus/console_libraries'
          - '--web.console.templates=/etc/prometheus/consoles'
          - '--storage.tsdb.retention.time=200h'
          - '--web.enable-lifecycle'
          - '--web.enable-admin-api'
        ports:
        - containerPort: 9090
        resources:
          requests:
            cpu: 500m
            memory: 500M
          limits:
            cpu: 1
            memory: 1Gi
        volumeMounts:
        - name: prometheus-config-volume
          mountPath: /etc/prometheus/
        - name: prometheus-storage-volume
          mountPath: /prometheus/
      volumes:
      - name: prometheus-config-volume
        configMap:
          defaultMode: 420
          name: prometheus-config
      - name: prometheus-storage-volume
        emptyDir: {}
---
# Prometheus Service
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '9090'
spec:
  selector:
    app: prometheus
  type: ClusterIP
  ports:
    - port: 9090
      targetPort: 9090
---
# Grafana Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
      - name: grafana
        image: grafana/grafana:10.0.0
        ports:
        - containerPort: 3000
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
          limits:
            cpu: 200m
            memory: 256Mi
        env:
        - name: GF_SECURITY_ADMIN_PASSWORD
          value: "admin123"
        - name: GF_INSTALL_PLUGINS
          value: "grafana-kubernetes-app"
        volumeMounts:
        - name: grafana-storage
          mountPath: /var/lib/grafana
      volumes:
      - name: grafana-storage
        emptyDir: {}
---
# Grafana Service
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
spec:
  selector:
    app: grafana
  type: LoadBalancer
  ports:
    - port: 3000
      targetPort: 3000
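
With the monitoring stack deployed, Prometheus and Grafana can be reached locally over port-forwards to confirm that scrape targets (including the federation job) are up. A minimal sketch, run in separate terminals:

# Prometheus: browse http://localhost:9090/targets after forwarding
kubectl --context=primary -n monitoring port-forward svc/prometheus 9090:9090

# Grafana: browse http://localhost:3000 after forwarding
kubectl --context=primary -n monitoring port-forward svc/grafana 3000:3000
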
Step 6: Disaster Recovery Setup

Configure automated failover and data replication between regions.

Disaster Recovery Configuration

# disaster-recovery.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: disaster-recovery
---
# Velero Backup Configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: velero-config
  namespace: disaster-recovery
data:
  backup-schedule.yaml: |
    apiVersion: velero.io/v1
    kind: Schedule
    metadata:
      name: daily-backup
      namespace: velero
    spec:
      schedule: "0 2 * * *"
      template:
        includedNamespaces:
        - production
        - monitoring
        - istio-system
        storageLocation: default
        volumeSnapshotLocations:
        - default
        ttl: 720h0m0s
---
# Database Replication Job
apiVersion: batch/v1
kind: CronJob
metadata:
  name: database-replication
  namespace: disaster-recovery
spec:
  schedule: "*/15 * * * *"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: db-replication
            image: postgres:15-alpine
            command:
            - /bin/bash
            - -c
            - |
              # Primary to Secondary replication
              pg_dump -h primary-db.production.svc.cluster.local \
                      -U postgres \
                      -d app_db \
                      --no-owner \
                      --no-privileges |
                psql -h secondary-db-endpoint \
                     -U postgres \
                     -d app_db
              
              # Verify replication status
              REPLICATION_LAG=$(psql -h secondary-db-endpoint \
                                     -U postgres \
                                     -d app_db \
                                     -t -c "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))")
              
              if (( $(echo "$REPLICATION_LAG > 300" | bc -l) )); then
                echo "ALERT: Replication lag is $REPLICATION_LAG seconds"
                exit 1
              fi
            env:
            - name: PGPASSWORD
              valueFrom:
                secretKeyRef:
                  name: db-credentials
                  key: password
          restartPolicy: OnFailure
---
# Failover Script ConfigMap
apiVersion: v1
kind: ConfigMap
metadata:
  name: failover-scripts
  namespace: disaster-recovery
data:
  failover.sh: |
    #!/bin/bash
    set -e
    
    echo "Starting disaster recovery failover..."
    
    # Update DNS to point to secondary region
    aws route53 change-resource-record-sets \
      --hosted-zone-id Z123456789 \
      --change-batch '{
        "Changes": [{
          "Action": "UPSERT",
          "ResourceRecordSet": {
            "Name": "app.example.com",
            "Type": "CNAME",
            "TTL": 60,
            "ResourceRecords": [{"Value": "secondary-lb.us-west-2.elb.amazonaws.com"}]
          }
        }]
      }'
    
    # Scale up secondary region
    kubectl --context=secondary-cluster \
      scale deployment web-app-secondary \
      --namespace=production \
      --replicas=6
    
    # Update Istio traffic routing
    kubectl --context=secondary-cluster apply -f - <<EOF
    apiVersion: networking.istio.io/v1beta1
    kind: VirtualService
    metadata:
      name: web-app-vs
      namespace: production
    spec:
      hosts:
      - web-app-service
      http:
      - route:
        - destination:
            host: web-app-service
            subset: secondary
          weight: 100
    EOF
    
    # Promote secondary database to primary
    psql -h secondary-db-endpoint \
         -U postgres \
         -d app_db \
         -c "SELECT pg_promote();"
    
    echo "Failover completed successfully"
  
  health-check.sh: |
    #!/bin/bash
    
    # Check primary region health
    PRIMARY_HEALTH=$(curl -s -o /dev/null -w "%{http_code}" \
      https://primary-app.example.com/health || echo "000")
    
    # Check secondary region health
    SECONDARY_HEALTH=$(curl -s -o /dev/null -w "%{http_code}" \
      https://secondary-app.example.com/health || echo "000")
    
    echo "Primary health: $PRIMARY_HEALTH"
    echo "Secondary health: $SECONDARY_HEALTH"
    
    # Trigger failover if primary is down and secondary is healthy
    if [[ "$PRIMARY_HEALTH" != "200" && "$SECONDARY_HEALTH" == "200" ]]; then
      echo "Primary region is unhealthy, initiating failover..."
      /scripts/failover.sh
    fi
---
# Health Check CronJob
apiVersion: batch/v1
kind: CronJob
metadata:
  name: health-check
  namespace: disaster-recovery
spec:
  schedule: "*/2 * * * *"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: health-checker
            image: curlimages/curl:8.1.0
            command:
            - /bin/sh
            - /scripts/health-check.sh
            volumeMounts:
            - name: scripts
              mountPath: /scripts
          volumes:
          - name: scripts
            configMap:
              name: failover-scripts
              defaultMode: 0755
          restartPolicy: OnFailure
---
# Alerting Rules for Disaster Recovery
apiVersion: v1
kind: ConfigMap
metadata:
  name: dr-alerting-rules
  namespace: monitoring
data:
  disaster-recovery.yml: |
    groups:
    - name: disaster-recovery
      rules:
      - alert: PrimaryRegionDown
        expr: up{job="kubernetes-apiservers",cluster="primary-cluster"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Primary region is down"
          description: "Primary Kubernetes cluster has been down for more than 5 minutes"
      
      - alert: DatabaseReplicationLag
        expr: postgres_replication_lag_seconds > 300
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Database replication lag is high"
          description: "Database replication lag is {{ $value }} seconds"
      
      - alert: CrossRegionConnectivityLoss
        expr: probe_success{job="cross-region-probe"} == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "Cross-region connectivity lost"
          description: "Unable to reach secondary region from primary"
Deployment Commands
Step-by-step commands to deploy your multi-region Kubernetes infrastructure

1. Initialize Terraform

terraform init

2. Plan Infrastructure

terraform plan -out=tfplan

3. Deploy Infrastructure

terraform apply tfplan

4. Configure kubectl contexts

aws eks update-kubeconfig --region us-east-1 --name primary-k8s-cluster --alias primary
aws eks update-kubeconfig --region us-west-2 --name secondary-k8s-cluster --alias secondary
aws eks update-kubeconfig --region eu-west-1 --name edge-k8s-cluster --alias edge

5. Install Istio

# Note: the IstioOperator resource in istio-setup.yaml only takes effect if the Istio
# operator is running in the cluster (e.g. after `istioctl operator init`) or if the
# control plane is installed directly with `istioctl install`.

# Install Istio on primary cluster
kubectl --context=primary apply -f istio-setup.yaml

# Install Istio on secondary cluster
kubectl --context=secondary apply -f istio-setup.yaml

# Install Istio on edge cluster
kubectl --context=edge apply -f istio-setup.yaml

6. Deploy Applications

kubectl --context=primary apply -f multi-region-app.yaml
kubectl --context=secondary apply -f multi-region-app.yaml
kubectl --context=edge apply -f multi-region-app.yaml

7. Setup Monitoring

kubectl --context=primary apply -f monitoring-setup.yaml
kubectl --context=secondary apply -f monitoring-setup.yaml
kubectl --context=edge apply -f monitoring-setup.yaml
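
Once all three clusters are provisioned and configured, a short loop over the contexts gives an end-to-end health snapshot; a minimal sketch:

# Quick health snapshot across all three clusters
for ctx in primary secondary edge; do
  echo "== $ctx =="
  kubectl --context=$ctx get nodes
  kubectl --context=$ctx -n istio-system get pods
  kubectl --context=$ctx -n production get deploy,svc
done
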
Troubleshooting

Cross-region connectivity issues

If services can't communicate across regions, check VPC peering connections and security groups.

# Check VPC peering status
aws ec2 describe-vpc-peering-connections

# Test connectivity
kubectl --context=primary exec -it test-pod -- nc -zv secondary-service.production.svc.cluster.local 80

Istio service mesh issues

If Istio isn't working properly, check the control plane status and proxy configurations.

# Check Istio status
istioctl proxy-status

# Check proxy configuration
istioctl proxy-config cluster <pod-name> -n production

Database replication lag

Monitor replication lag and adjust replication frequency if needed.

# Check replication lag
psql -h secondary-db -U postgres -d app_db -c "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) AS lag_seconds;"
Frequently Asked Questions

What are the benefits of multi-region Kubernetes deployments?

Multi-region Kubernetes deployments provide high availability, disaster recovery, reduced latency for global users, compliance with data residency requirements, and protection against regional outages.

How do you handle data consistency across regions?

Data consistency is managed through distributed databases, cross-region replication, eventual consistency patterns, and careful application design with proper data partitioning strategies.

What networking considerations are important for multi-region K8s?

Key networking considerations include cross-region connectivity, service mesh configuration, ingress controllers, DNS management, and network security policies across regions.

How do you monitor multi-region Kubernetes clusters?

Monitoring involves centralized observability platforms, distributed tracing, cross-region metrics aggregation, alerting systems, and comprehensive logging across all clusters.

What are the cost implications of multi-region deployments?

Costs include additional infrastructure, cross-region data transfer, storage replication, and operational overhead; however, these costs are often offset by improved availability and performance.

Next Steps

Security Enhancements

  • Implement Pod Security Standards (see the sketch after this list)
  • Add network policies
  • Enable audit logging
  • Implement RBAC policies
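
Pod Security Standards can be enforced per namespace with labels. A minimal sketch for the production namespace; starting in warn/audit mode is safer here because sidecar-injected workloads (Istio init containers) typically violate the restricted profile unless the Istio CNI plugin is used:

# Surface Pod Security Standard violations without breaking existing workloads
kubectl label namespace production \
  pod-security.kubernetes.io/warn=restricted \
  pod-security.kubernetes.io/audit=restricted --overwrite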

Operational Improvements

  • Set up GitOps with ArgoCD (see the sketch after this list)
  • Implement chaos engineering
  • Add cost monitoring
  • Automate certificate management
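
Bootstrapping Argo CD on the primary cluster is a common first step toward GitOps; a minimal sketch using the upstream install manifest (pin a specific version for production):

# Install Argo CD into its own namespace on the primary cluster
kubectl --context=primary create namespace argocd
kubectl --context=primary apply -n argocd \
  -f https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml

# Retrieve the initial admin password
kubectl --context=primary -n argocd get secret argocd-initial-admin-secret \
  -o jsonpath='{.data.password}' | base64 -d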

Ready to Deploy Multi-Region Kubernetes?

Get the complete configuration package with all Terraform files, Kubernetes manifests, and deployment scripts.