Multi-Region Kubernetes Deployment
Deploy production-ready Kubernetes clusters across multiple regions with high availability, disaster recovery, and enterprise-grade security hardening.

Primary Region (US-East)
Main Kubernetes cluster with control plane and worker nodes
Secondary Region (US-West)
Disaster recovery cluster with data replication
Edge Region (EU-West)
Edge cluster for low-latency European traffic
Required Tools
- Terraform ≥ 1.5.0
- kubectl ≥ 1.28.0
- Helm ≥ 3.12.0
- AWS CLI v2 or GCP CLI
Cloud Permissions
- EKS/GKE cluster management
- VPC and networking
- IAM role management
- Load balancer configuration
Main Terraform Configuration
# main.tf
terraform {
required_version = ">= 1.5.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
kubernetes = {
source = "hashicorp/kubernetes"
version = "~> 2.23"
}
helm = {
source = "hashicorp/helm"
version = "~> 2.11"
}
}
}
# Provider configurations for multiple regions
provider "aws" {
alias = "us_east_1"
region = "us-east-1"
}
provider "aws" {
alias = "us_west_2"
region = "us-west-2"
}
provider "aws" {
alias = "eu_west_1"
region = "eu-west-1"
}
# Primary region EKS cluster
module "primary_cluster" {
source = "./modules/eks-cluster"
providers = {
aws = aws.us_east_1
}
cluster_name = "primary-k8s-cluster"
cluster_version = "1.28"
region = "us-east-1"
vpc_cidr = "10.0.0.0/16"
node_groups = {
system = {
instance_types = ["t3.medium"]
min_size = 2
max_size = 4
desired_size = 2
labels = {
role = "system"
}
taints = [{
key = "CriticalAddonsOnly"
value = "true"
effect = "NO_SCHEDULE"
}]
}
application = {
instance_types = ["t3.large"]
min_size = 3
max_size = 10
desired_size = 3
labels = {
role = "application"
}
}
}
tags = {
Environment = "production"
Region = "primary"
Terraform = "true"
}
}
# Secondary region EKS cluster (DR)
module "secondary_cluster" {
source = "./modules/eks-cluster"
providers = {
aws = aws.us_west_2
}
cluster_name = "secondary-k8s-cluster"
cluster_version = "1.28"
region = "us-west-2"
vpc_cidr = "10.1.0.0/16"
node_groups = {
system = {
instance_types = ["t3.medium"]
min_size = 1
max_size = 3
desired_size = 1
labels = {
role = "system"
}
}
application = {
instance_types = ["t3.large"]
min_size = 2
max_size = 8
desired_size = 2
labels = {
role = "application"
}
}
}
tags = {
Environment = "production"
Region = "secondary"
Terraform = "true"
}
}
# Edge region EKS cluster
module "edge_cluster" {
source = "./modules/eks-cluster"
providers = {
aws = aws.eu_west_1
}
cluster_name = "edge-k8s-cluster"
cluster_version = "1.28"
region = "eu-west-1"
vpc_cidr = "10.2.0.0/16"
node_groups = {
edge = {
instance_types = ["t3.medium"]
min_size = 2
max_size = 6
desired_size = 2
labels = {
role = "edge"
}
}
}
tags = {
Environment = "production"
Region = "edge"
Terraform = "true"
}
}
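The root configuration instantiates the local ./modules/eks-cluster module three times; rolling the regions out one at a time keeps the blast radius small. One possible workflow (Terraform will warn that -target is meant for exceptional use):
terraform init
terraform validate

# Bring up the primary region first, then the DR and edge regions
terraform apply -target=module.primary_cluster
terraform apply -target=module.secondary_cluster -target=module.edge_cluster

# Review what was created
terraform state list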
EKS Cluster Module
# modules/eks-cluster/main.tf
data "aws_availability_zones" "available" {
state = "available"
}
# VPC Configuration
resource "aws_vpc" "main" {
cidr_block = var.vpc_cidr
enable_dns_hostnames = true
enable_dns_support = true
tags = merge(var.tags, {
Name = "${var.cluster_name}-vpc"
"kubernetes.io/cluster/${var.cluster_name}" = "shared"
})
}
# Internet Gateway
resource "aws_internet_gateway" "main" {
vpc_id = aws_vpc.main.id
tags = merge(var.tags, {
Name = "${var.cluster_name}-igw"
})
}
# Public Subnets
resource "aws_subnet" "public" {
count = 3
vpc_id = aws_vpc.main.id
cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index)
availability_zone = data.aws_availability_zones.available.names[count.index]
map_public_ip_on_launch = true
tags = merge(var.tags, {
Name = "${var.cluster_name}-public-${count.index + 1}"
"kubernetes.io/cluster/${var.cluster_name}" = "shared"
"kubernetes.io/role/elb" = "1"
})
}
# Private Subnets
resource "aws_subnet" "private" {
count = 3
vpc_id = aws_vpc.main.id
cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index + 10)
availability_zone = data.aws_availability_zones.available.names[count.index]
tags = merge(var.tags, {
Name = "${var.cluster_name}-private-${count.index + 1}"
"kubernetes.io/cluster/${var.cluster_name}" = "owned"
"kubernetes.io/role/internal-elb" = "1"
})
}
# NAT Gateways
resource "aws_eip" "nat" {
count = 3
domain = "vpc"
tags = merge(var.tags, {
Name = "${var.cluster_name}-nat-eip-${count.index + 1}"
})
}
resource "aws_nat_gateway" "main" {
count = 3
allocation_id = aws_eip.nat[count.index].id
subnet_id = aws_subnet.public[count.index].id
tags = merge(var.tags, {
Name = "${var.cluster_name}-nat-${count.index + 1}"
})
depends_on = [aws_internet_gateway.main]
}
# Route Tables
resource "aws_route_table" "public" {
vpc_id = aws_vpc.main.id
route {
cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.main.id
}
tags = merge(var.tags, {
Name = "${var.cluster_name}-public-rt"
})
}
resource "aws_route_table" "private" {
count = 3
vpc_id = aws_vpc.main.id
route {
cidr_block = "0.0.0.0/0"
nat_gateway_id = aws_nat_gateway.main[count.index].id
}
tags = merge(var.tags, {
Name = "${var.cluster_name}-private-rt-${count.index + 1}"
})
}
# Route Table Associations
resource "aws_route_table_association" "public" {
count = 3
subnet_id = aws_subnet.public[count.index].id
route_table_id = aws_route_table.public.id
}
resource "aws_route_table_association" "private" {
count = 3
subnet_id = aws_subnet.private[count.index].id
route_table_id = aws_route_table.private[count.index].id
}
# EKS Cluster IAM Role
resource "aws_iam_role" "cluster" {
name = "${var.cluster_name}-cluster-role"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "eks.amazonaws.com"
}
}
]
})
tags = var.tags
}
resource "aws_iam_role_policy_attachment" "cluster_AmazonEKSClusterPolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.cluster.name
}
# EKS Cluster Security Group
resource "aws_security_group" "cluster" {
name_prefix = "${var.cluster_name}-cluster-sg"
vpc_id = aws_vpc.main.id
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = merge(var.tags, {
Name = "${var.cluster_name}-cluster-sg"
})
}
# EKS Cluster
resource "aws_eks_cluster" "main" {
name = var.cluster_name
role_arn = aws_iam_role.cluster.arn
version = var.cluster_version
vpc_config {
subnet_ids = concat(aws_subnet.private[*].id, aws_subnet.public[*].id)
endpoint_private_access = true
endpoint_public_access = true
public_access_cidrs = ["0.0.0.0/0"]
security_group_ids = [aws_security_group.cluster.id]
}
encryption_config {
provider {
key_arn = aws_kms_key.eks.arn
}
resources = ["secrets"]
}
enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
depends_on = [
aws_iam_role_policy_attachment.cluster_AmazonEKSClusterPolicy,
aws_cloudwatch_log_group.cluster
]
tags = var.tags
}
# CloudWatch Log Group
resource "aws_cloudwatch_log_group" "cluster" {
name = "/aws/eks/${var.cluster_name}/cluster"
retention_in_days = 7
tags = var.tags
}
# KMS Key for EKS encryption
resource "aws_kms_key" "eks" {
description = "EKS Secret Encryption Key for ${var.cluster_name}"
deletion_window_in_days = 7
tags = var.tags
}
resource "aws_kms_alias" "eks" {
name = "alias/eks-${var.cluster_name}"
target_key_id = aws_kms_key.eks.key_id
}
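The module shown here covers networking, IAM, and the control plane; the node groups and the outputs referenced later (vpc_id, private_route_table_ids, cluster_security_group_id) still need their own files. Once applied, confirm each control plane is active and that secrets encryption is on:
# Cluster status, version, and secrets encryption for the primary region
aws eks describe-cluster --name primary-k8s-cluster --region us-east-1 \
  --query 'cluster.{status:status,version:version,encryption:encryptionConfig}'

# DR and edge clusters
aws eks describe-cluster --name secondary-k8s-cluster --region us-west-2 --query 'cluster.status'
aws eks describe-cluster --name edge-k8s-cluster --region eu-west-1 --query 'cluster.status'

# Managed node groups attached to the primary cluster
aws eks list-nodegroups --cluster-name primary-k8s-cluster --region us-east-1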
VPC Peering Configuration
# networking.tf
# VPC Peering between Primary and Secondary regions
resource "aws_vpc_peering_connection" "primary_secondary" {
provider = aws.us_east_1
vpc_id = module.primary_cluster.vpc_id
peer_vpc_id = module.secondary_cluster.vpc_id
peer_region = "us-west-2"
auto_accept = false
tags = {
Name = "primary-secondary-peering"
}
}
resource "aws_vpc_peering_connection_accepter" "primary_secondary" {
provider = aws.us_west_2
vpc_peering_connection_id = aws_vpc_peering_connection.primary_secondary.id
auto_accept = true
tags = {
Name = "primary-secondary-peering-accepter"
}
}
# VPC Peering between Primary and Edge regions
resource "aws_vpc_peering_connection" "primary_edge" {
provider = aws.us_east_1
vpc_id = module.primary_cluster.vpc_id
peer_vpc_id = module.edge_cluster.vpc_id
peer_region = "eu-west-1"
auto_accept = false
tags = {
Name = "primary-edge-peering"
}
}
resource "aws_vpc_peering_connection_accepter" "primary_edge" {
provider = aws.eu_west_1
vpc_peering_connection_id = aws_vpc_peering_connection.primary_edge.id
auto_accept = true
tags = {
Name = "primary-edge-peering-accepter"
}
}
# Route table updates for peering connections
resource "aws_route" "primary_to_secondary" {
provider = aws.us_east_1
count = length(module.primary_cluster.private_route_table_ids)
route_table_id = module.primary_cluster.private_route_table_ids[count.index]
destination_cidr_block = "10.1.0.0/16"
vpc_peering_connection_id = aws_vpc_peering_connection.primary_secondary.id
}
resource "aws_route" "secondary_to_primary" {
provider = aws.us_west_2
count = length(module.secondary_cluster.private_route_table_ids)
route_table_id = module.secondary_cluster.private_route_table_ids[count.index]
destination_cidr_block = "10.0.0.0/16"
vpc_peering_connection_id = aws_vpc_peering_connection.primary_secondary.id
}
resource "aws_route" "primary_to_edge" {
provider = aws.us_east_1
count = length(module.primary_cluster.private_route_table_ids)
route_table_id = module.primary_cluster.private_route_table_ids[count.index]
destination_cidr_block = "10.2.0.0/16"
vpc_peering_connection_id = aws_vpc_peering_connection.primary_edge.id
}
resource "aws_route" "edge_to_primary" {
provider = aws.eu_west_1
count = length(module.edge_cluster.private_route_table_ids)
route_table_id = module.edge_cluster.private_route_table_ids[count.index]
destination_cidr_block = "10.0.0.0/16"
vpc_peering_connection_id = aws_vpc_peering_connection.primary_edge.id
}
# Security group rules for cross-region communication
resource "aws_security_group_rule" "primary_allow_secondary" {
provider = aws.us_east_1
type = "ingress"
from_port = 0
to_port = 65535
protocol = "tcp"
cidr_blocks = ["10.1.0.0/16"]
security_group_id = module.primary_cluster.cluster_security_group_id
}
resource "aws_security_group_rule" "secondary_allow_primary" {
provider = aws.us_west_2
type = "ingress"
from_port = 0
to_port = 65535
protocol = "tcp"
cidr_blocks = ["10.0.0.0/16"]
security_group_id = module.secondary_cluster.cluster_security_group_id
}
resource "aws_security_group_rule" "primary_allow_edge" {
provider = aws.us_east_1
type = "ingress"
from_port = 0
to_port = 65535
protocol = "tcp"
cidr_blocks = ["10.2.0.0/16"]
security_group_id = module.primary_cluster.cluster_security_group_id
}
resource "aws_security_group_rule" "edge_allow_primary" {
provider = aws.eu_west_1
type = "ingress"
from_port = 0
to_port = 65535
protocol = "tcp"
cidr_blocks = ["10.0.0.0/16"]
security_group_id = module.edge_cluster.cluster_security_group_id
}
Istio Multi-Cluster Setup
# istio-setup.yaml
apiVersion: v1
kind: Namespace
metadata:
name: istio-system
labels:
istio-injection: disabled
---
apiVersion: v1
kind: Namespace
metadata:
name: istio-gateway
labels:
istio-injection: enabled
---
# Istio Control Plane for Primary Cluster
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
metadata:
name: primary-cluster
namespace: istio-system
spec:
values:
global:
meshID: mesh1
network: primary-network
externalIstiod: false
pilot:
env:
EXTERNAL_ISTIOD: false
ENABLE_CROSS_CLUSTER_WORKLOAD_ENTRY: true
components:
pilot:
k8s:
env:
- name: PILOT_ENABLE_WORKLOAD_ENTRY_AUTOREGISTRATION
value: "true"
- name: PILOT_ENABLE_CROSS_CLUSTER_WORKLOAD_ENTRY
value: "true"
---
# Gateway for cross-cluster communication
apiVersion: networking.istio.io/v1beta1
kind: Gateway
metadata:
name: cross-network-gateway
namespace: istio-system
spec:
selector:
istio: eastwestgateway
servers:
- port:
number: 15443
name: tls
protocol: TLS
tls:
mode: AUTO_PASSTHROUGH
hosts:
- "*.local"
---
# East-West Gateway Service
apiVersion: v1
kind: Service
metadata:
name: istio-eastwestgateway
namespace: istio-system
labels:
istio: eastwestgateway
app: istio-eastwestgateway
spec:
type: LoadBalancer
selector:
istio: eastwestgateway
ports:
- port: 15021
targetPort: 15021
name: status-port
- port: 15443
targetPort: 15443
name: tls
- port: 15012
targetPort: 15012
name: tls-istiod
- port: 15017
targetPort: 15017
name: tls-webhook
---
# Deployment for East-West Gateway
apiVersion: apps/v1
kind: Deployment
metadata:
name: istio-eastwestgateway
namespace: istio-system
spec:
selector:
matchLabels:
istio: eastwestgateway
app: istio-eastwestgateway
template:
metadata:
annotations:
inject.istio.io/templates: gateway
labels:
istio: eastwestgateway
app: istio-eastwestgateway
spec:
containers:
- name: istio-proxy
image: auto
resources:
limits:
cpu: 2000m
memory: 1024Mi
requests:
cpu: 100m
memory: 128Mi
serviceAccountName: istio-eastwestgateway-service-account
---
# Service Account for East-West Gateway
apiVersion: v1
kind: ServiceAccount
metadata:
name: istio-eastwestgateway-service-account
namespace: istio-system
labels:
istio: eastwestgateway
---
# Role for East-West Gateway
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: istio-eastwestgateway-sds
namespace: istio-system
rules:
- apiGroups: [""]
resources: ["secrets"]
verbs: ["get", "watch", "list"]
---
# RoleBinding for East-West Gateway
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: istio-eastwestgateway-sds
namespace: istio-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: istio-eastwestgateway-sds
subjects:
- kind: ServiceAccount
name: istio-eastwestgateway-service-account
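The IstioOperator resource above only takes effect when processed by istioctl or the in-cluster Istio operator; applying the file with plain kubectl creates the resource but does not install a control plane. Each cluster also needs its own clusterName and network values (the spec above is the primary cluster's), so in practice you keep one variant per cluster. A sketch for the primary cluster, assuming the kubectl contexts created later in this guide and that the IstioOperator spec has been split into its own file (istio-control-plane.yaml is an assumed name):
# Install the control plane described by the IstioOperator spec (assumed file name)
istioctl install --context=primary -f istio-control-plane.yaml -y

# Label the system namespace with the network name used in the spec
kubectl --context=primary label namespace istio-system \
  topology.istio.io/network=primary-network --overwrite

# Apply the remaining east-west gateway resources and confirm the mesh is healthy
kubectl --context=primary apply -f istio-setup.yaml
istioctl --context=primary proxy-status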
Cross-Cluster Service Discovery
# cross-cluster-discovery.yaml
# Secret for remote cluster access
apiVersion: v1
kind: Secret
metadata:
name: istio-remote-secret-secondary
namespace: istio-system
labels:
istio/cluster: secondary-cluster
type: Opaque
stringData:
secondary-cluster: |
apiVersion: v1
kind: Config
clusters:
- cluster:
certificate-authority-data: LS0tLS1CRUdJTi...
server: https://secondary-cluster-endpoint
name: secondary-cluster
contexts:
- context:
cluster: secondary-cluster
user: secondary-cluster
name: secondary-cluster
current-context: secondary-cluster
users:
- name: secondary-cluster
user:
token: eyJhbGciOiJSUzI1NiIsImtpZCI6...
---
# Secret for edge cluster access
apiVersion: v1
kind: Secret
metadata:
name: istio-remote-secret-edge
namespace: istio-system
labels:
istio/cluster: edge-cluster
type: Opaque
stringData:
edge-cluster: |
apiVersion: v1
kind: Config
clusters:
- cluster:
certificate-authority-data: LS0tLS1CRUdJTi...
server: https://edge-cluster-endpoint
name: edge-cluster
contexts:
- context:
cluster: edge-cluster
user: edge-cluster
name: edge-cluster
current-context: edge-cluster
users:
- name: edge-cluster
user:
token: eyJhbGciOiJSUzI1NiIsImtpZCI6...
---
# Network configuration for multi-cluster
apiVersion: v1
kind: ConfigMap
metadata:
name: istio-multi-network
namespace: istio-system
data:
networks: |
networks:
primary-network:
endpoints:
- fromRegistry: primary-cluster
gateways:
- registryServiceName: istio-eastwestgateway.istio-system.svc.cluster.local
port: 15021
secondary-network:
endpoints:
- fromRegistry: secondary-cluster
gateways:
- registryServiceName: istio-eastwestgateway.istio-system.svc.cluster.local
port: 15021
edge-network:
endpoints:
- fromRegistry: edge-cluster
gateways:
- registryServiceName: istio-eastwestgateway.istio-system.svc.cluster.local
port: 15021
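Hand-maintaining remote kubeconfig Secrets like the ones above is error-prone, since the token and CA data must belong to a service account with the right permissions in the remote cluster. istioctl can generate and apply these secrets for you; a sketch, assuming the primary, secondary, and edge contexts from this guide:
# Register the secondary cluster with the primary control plane
istioctl create-remote-secret --context=secondary --name=secondary-cluster | \
  kubectl --context=primary apply -f -

# Register the edge cluster
istioctl create-remote-secret --context=edge --name=edge-cluster | \
  kubectl --context=primary apply -f -

# Confirm the remote clusters are synced
istioctl --context=primary remote-clusters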
Application Deployment Manifest
# multi-region-app.yaml
apiVersion: v1
kind: Namespace
metadata:
name: production
labels:
istio-injection: enabled
---
# ConfigMap for application configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: app-config
namespace: production
data:
region: "primary"
database_url: "postgresql://primary-db:5432/app"
redis_url: "redis://primary-redis:6379"
log_level: "info"
---
# Secret for database credentials
apiVersion: v1
kind: Secret
metadata:
name: app-secrets
namespace: production
type: Opaque
data:
db_password: cGFzc3dvcmQxMjM=
redis_password: cmVkaXNwYXNzd29yZA==
jwt_secret: and0c2VjcmV0a2V5MTIz
---
# Deployment for primary region
apiVersion: apps/v1
kind: Deployment
metadata:
name: web-app-primary
namespace: production
labels:
app: web-app
version: v1
region: primary
spec:
replicas: 3
selector:
matchLabels:
app: web-app
version: v1
region: primary
template:
metadata:
labels:
app: web-app
version: v1
region: primary
spec:
containers:
- name: web-app
image: nginx:1.21-alpine
ports:
- containerPort: 80
env:
- name: REGION
valueFrom:
configMapKeyRef:
name: app-config
key: region
- name: DATABASE_URL
valueFrom:
configMapKeyRef:
name: app-config
key: database_url
- name: DB_PASSWORD
valueFrom:
secretKeyRef:
name: app-secrets
key: db_password
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "200m"
livenessProbe:
httpGet:
# the placeholder nginx image only serves /; point these probes at your app's real health endpoints
path: /
port: 80
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /
port: 80
initialDelaySeconds: 5
periodSeconds: 5
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- web-app
topologyKey: kubernetes.io/hostname
---
# Service for the application
apiVersion: v1
kind: Service
metadata:
name: web-app-service
namespace: production
labels:
app: web-app
spec:
selector:
app: web-app
ports:
- port: 80
targetPort: 80
name: http
type: ClusterIP
---
# Horizontal Pod Autoscaler
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: web-app-hpa
namespace: production
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: web-app-primary
minReplicas: 3
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
---
# Virtual Service for traffic management
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: web-app-vs
namespace: production
spec:
hosts:
- web-app-service
http:
- match:
- headers:
region:
exact: primary
route:
- destination:
host: web-app-service
subset: primary
weight: 100
- route:
- destination:
host: web-app-service
subset: primary
weight: 80
- destination:
host: web-app-service.production.svc.cluster.local
subset: secondary
weight: 20
---
# Destination Rule for load balancing
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: web-app-dr
namespace: production
spec:
host: web-app-service
trafficPolicy:
loadBalancer:
simple: LEAST_CONN
connectionPool:
tcp:
maxConnections: 100
http:
http1MaxPendingRequests: 50
maxRequestsPerConnection: 10
outlierDetection:
consecutive5xxErrors: 3
interval: 30s
baseEjectionTime: 30s
maxEjectionPercent: 50
subsets:
- name: primary
labels:
region: primary
- name: secondary
labels:
region: secondary
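Once applied, the header-based routing in the VirtualService can be exercised from inside the mesh. A quick check, assuming the deployment above is ready (note the placeholder nginx image only serves /):
# Confirm the routing objects exist
kubectl --context=primary -n production get deploy,svc,hpa,virtualservice,destinationrule

# Send a request pinned to the primary subset via the "region: primary" header
kubectl --context=primary -n production run curl-test --rm -i --restart=Never \
  --image=curlimages/curl:8.1.0 -- \
  curl -s -o /dev/null -w "%{http_code}\n" -H "region: primary" http://web-app-service/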
Prometheus Multi-Cluster Setup
# monitoring-setup.yaml
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
---
# Prometheus Configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitoring
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
cluster: 'primary-cluster'
region: 'us-east-1'
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
- job_name: 'istio-mesh'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istiod;http-monitoring
- job_name: 'federate-secondary'
scrape_interval: 15s
honor_labels: true
metrics_path: '/federate'
params:
'match[]':
- '{job=~"kubernetes.*"}'
- '{job=~"istio.*"}'
static_configs:
- targets:
- 'prometheus.monitoring.svc.cluster.local:9090'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: secondary-prometheus-endpoint:9090
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
---
# Prometheus Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
serviceAccountName: prometheus
containers:
- name: prometheus
image: prom/prometheus:v2.45.0
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus/'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
ports:
- containerPort: 9090
resources:
requests:
cpu: 500m
memory: 500Mi
limits:
cpu: 1
memory: 1Gi
volumeMounts:
- name: prometheus-config-volume
mountPath: /etc/prometheus/
- name: prometheus-storage-volume
mountPath: /prometheus/
volumes:
- name: prometheus-config-volume
configMap:
defaultMode: 420
name: prometheus-config
- name: prometheus-storage-volume
emptyDir: {}
---
# Prometheus Service
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: monitoring
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '9090'
spec:
selector:
app: prometheus
type: ClusterIP
ports:
- port: 9090
targetPort: 9090
---
# Grafana Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
containers:
- name: grafana
image: grafana/grafana:10.0.0
ports:
- containerPort: 3000
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
env:
- name: GF_SECURITY_ADMIN_PASSWORD
value: "admin123"
- name: GF_INSTALL_PLUGINS
value: "grafana-kubernetes-app"
volumeMounts:
- name: grafana-storage
mountPath: /var/lib/grafana
volumes:
- name: grafana-storage
emptyDir: {}
---
# Grafana Service
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: monitoring
spec:
selector:
app: grafana
type: LoadBalancer
ports:
- port: 3000
targetPort: 3000
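The Prometheus Deployment above references a prometheus ServiceAccount that this manifest never creates, and the kubernetes_sd_configs jobs need read access to nodes, endpoints, and pods. A minimal sketch of the missing RBAC, with names assumed to match the Deployment:
kubectl --context=primary apply -f - <<EOF
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources: ["nodes", "nodes/proxy", "nodes/metrics", "services", "endpoints", "pods"]
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics", "/federate"]
  verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitoring
EOF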
Disaster Recovery Configuration
# disaster-recovery.yaml
apiVersion: v1
kind: Namespace
metadata:
name: disaster-recovery
---
# Velero Backup Configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: velero-config
namespace: disaster-recovery
data:
backup-schedule.yaml: |
apiVersion: velero.io/v1
kind: Schedule
metadata:
name: daily-backup
namespace: velero
spec:
schedule: "0 2 * * *"
template:
includedNamespaces:
- production
- monitoring
- istio-system
storageLocation: default
volumeSnapshotLocations:
- default
ttl: 720h0m0s
---
# Database Replication Job
apiVersion: batch/v1
kind: CronJob
metadata:
name: database-replication
namespace: disaster-recovery
spec:
schedule: "*/15 * * * *"
jobTemplate:
spec:
template:
spec:
containers:
- name: db-replication
image: postgres:15-alpine
command:
- /bin/bash
- -c
- |
# Primary to Secondary replication
pg_dump -h primary-db.production.svc.cluster.local -U postgres -d app_db --no-owner --no-privileges | psql -h secondary-db-endpoint -U postgres -d app_db
# Verify replication status
REPLICATION_LAG=$(psql -h secondary-db-endpoint -U postgres -d app_db -t -c "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))")
if (( $(echo "$REPLICATION_LAG > 300" | bc -l) )); then
echo "ALERT: Replication lag is $REPLICATION_LAG seconds"
exit 1
fi
env:
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: db-credentials
key: password
restartPolicy: OnFailure
---
# Failover Script ConfigMap
apiVersion: v1
kind: ConfigMap
metadata:
name: failover-scripts
namespace: disaster-recovery
data:
failover.sh: |
#!/bin/bash
set -e
echo "Starting disaster recovery failover..."
# Update DNS to point to secondary region
aws route53 change-resource-record-sets --hosted-zone-id Z123456789 --change-batch '{
"Changes": [{
"Action": "UPSERT",
"ResourceRecordSet": {
"Name": "app.example.com",
"Type": "CNAME",
"TTL": 60,
"ResourceRecords": [{"Value": "secondary-lb.us-west-2.elb.amazonaws.com"}]
}
}]
}'
# Scale up secondary region
kubectl --context=secondary-cluster scale deployment web-app-secondary --namespace=production --replicas=6
# Update Istio traffic routing
kubectl --context=secondary-cluster apply -f - <<EOF
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: web-app-vs
namespace: production
spec:
hosts:
- web-app-service
http:
- route:
- destination:
host: web-app-service
subset: secondary
weight: 100
EOF
# Promote secondary database to primary
psql -h secondary-db-endpoint -U postgres -d app_db -c "SELECT pg_promote();"
echo "Failover completed successfully"
health-check.sh: |
#!/bin/bash
# Check primary region health
PRIMARY_HEALTH=$(curl -s -o /dev/null -w "%{http_code}" https://primary-app.example.com/health || echo "000")
# Check secondary region health
SECONDARY_HEALTH=$(curl -s -o /dev/null -w "%{http_code}" https://secondary-app.example.com/health || echo "000")
echo "Primary health: $PRIMARY_HEALTH"
echo "Secondary health: $SECONDARY_HEALTH"
# Trigger failover if primary is down and secondary is healthy
if [[ "$PRIMARY_HEALTH" != "200" && "$SECONDARY_HEALTH" == "200" ]]; then
echo "Primary region is unhealthy, initiating failover..."
/scripts/failover.sh
fi
---
# Health Check CronJob
apiVersion: batch/v1
kind: CronJob
metadata:
name: health-check
namespace: disaster-recovery
spec:
schedule: "*/2 * * * *"
jobTemplate:
spec:
template:
spec:
containers:
- name: health-checker
image: curlimages/curl:8.1.0
command:
- /bin/sh
- /scripts/health-check.sh
volumeMounts:
- name: scripts
mountPath: /scripts
volumes:
- name: scripts
configMap:
name: failover-scripts
defaultMode: 0755
restartPolicy: OnFailure
---
# Alerting Rules for Disaster Recovery
apiVersion: v1
kind: ConfigMap
metadata:
name: dr-alerting-rules
namespace: monitoring
data:
disaster-recovery.yml: |
groups:
- name: disaster-recovery
rules:
- alert: PrimaryRegionDown
expr: up{job="kubernetes-apiservers",cluster="primary-cluster"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Primary region is down"
description: "Primary Kubernetes cluster has been down for more than 5 minutes"
- alert: DatabaseReplicationLag
expr: postgres_replication_lag_seconds > 300
for: 2m
labels:
severity: warning
annotations:
summary: "Database replication lag is high"
description: "Database replication lag is {{ $value }} seconds"
- alert: CrossRegionConnectivityLoss
expr: probe_success{job="cross-region-probe"} == 0
for: 3m
labels:
severity: critical
annotations:
summary: "Cross-region connectivity lost"
description: "Unable to reach secondary region from primary"
1. Initialize Terraform
terraform init
2. Plan Infrastructure
terraform plan -out=tfplan
3. Deploy Infrastructure
terraform apply tfplan
4. Configure kubectl contexts
aws eks update-kubeconfig --region us-east-1 --name primary-k8s-cluster --alias primary
aws eks update-kubeconfig --region us-west-2 --name secondary-k8s-cluster --alias secondary
aws eks update-kubeconfig --region eu-west-1 --name edge-k8s-cluster --alias edge
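Verify that all three contexts resolve and that the node groups have joined their clusters:
kubectl config get-contexts
for ctx in primary secondary edge; do
  echo "== $ctx =="
  kubectl --context=$ctx get nodes -o wide
done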
5. Install Istio
# Apply the Istio configuration to each cluster. The IstioOperator resource in
# istio-setup.yaml only takes effect if istioctl (or the Istio operator) processes it;
# see the istioctl example above.
kubectl --context=primary apply -f istio-setup.yaml
kubectl --context=secondary apply -f istio-setup.yaml
kubectl --context=edge apply -f istio-setup.yaml
6. Deploy Applications
kubectl --context=primary apply -f multi-region-app.yaml
kubectl --context=secondary apply -f multi-region-app.yaml
kubectl --context=edge apply -f multi-region-app.yaml
7. Setup Monitoring
kubectl --context=primary apply -f monitoring-setup.yaml
kubectl --context=secondary apply -f monitoring-setup.yaml
kubectl --context=edge apply -f monitoring-setup.yaml
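8. Verify the Deployment
A quick post-deployment check across all three clusters (assumes the contexts from step 4):
for ctx in primary secondary edge; do
  echo "== $ctx =="
  kubectl --context=$ctx get pods -n istio-system
  kubectl --context=$ctx get pods -n production
  kubectl --context=$ctx get pods -n monitoring
done

# Port-forward Grafana from the primary cluster and browse http://localhost:3000
kubectl --context=primary -n monitoring port-forward svc/grafana 3000:3000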
Cross-region connectivity issues
If services can't communicate across regions, check VPC peering connections and security groups.
# Check VPC peering status
aws ec2 describe-vpc-peering-connections
# Test connectivity
kubectl --context=primary exec -it test-pod -- nc -zv secondary-service.production.svc.cluster.local 80
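The test-pod referenced above is not created anywhere in this guide; a throwaway busybox pod works (busybox ships nc):
# Create a temporary test pod, re-run the connectivity check, then clean up
kubectl --context=primary -n production run test-pod --image=busybox:1.36 --restart=Never -- sleep 3600
kubectl --context=primary -n production exec -it test-pod -- nc -zv secondary-service.production.svc.cluster.local 80
kubectl --context=primary -n production delete pod test-pod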
Istio service mesh issues
If Istio isn't working properly, check the control plane status and proxy configurations.
# Check Istio status
istioctl proxy-status
# Check proxy configuration
istioctl proxy-config cluster <pod-name> -n production
Database replication lag
Monitor replication lag and adjust replication frequency if needed.
# Check replication lag
psql -h secondary-db -U postgres -d app_db -c "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) AS lag_seconds;"
What are the benefits of multi-region Kubernetes deployments?
Multi-region Kubernetes deployments provide high availability, disaster recovery, reduced latency for global users, compliance with data residency requirements, and protection against regional outages.
How do you handle data consistency across regions?
Data consistency is managed through distributed databases, cross-region replication, eventual consistency patterns, and careful application design with proper data partitioning strategies.
What networking considerations are important for multi-region K8s?
Key networking considerations include cross-region connectivity, service mesh configuration, ingress controllers, DNS management, and network security policies across regions.
How do you monitor multi-region Kubernetes clusters?
Monitoring involves centralized observability platforms, distributed tracing, cross-region metrics aggregation, alerting systems, and comprehensive logging across all clusters.
What are the cost implications of multi-region deployments?
Costs include additional infrastructure, cross-region data transfer, storage replication, and operational overhead. However, these are offset by improved availability and performance.
Security Enhancements
- Implement Pod Security Standards
- Add network policies (a starting sketch for these two items follows this list)
- Enable audit logging
- Implement RBAC policies
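A starting sketch for the first two items: enforce the baseline Pod Security Standard on the production namespace and add a default-deny ingress NetworkPolicy. Enforcement requires a CNI that supports NetworkPolicy, and explicit allow rules for the ingress gateway and sidecar traffic should be added before rolling this out:
# Enforce the baseline Pod Security Standard and warn on anything below restricted
kubectl --context=primary label namespace production \
  pod-security.kubernetes.io/enforce=baseline pod-security.kubernetes.io/warn=restricted --overwrite

# Deny all ingress by default; traffic must then be allowed by explicit policies
kubectl --context=primary apply -f - <<EOF
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny-ingress
  namespace: production
spec:
  podSelector: {}
  policyTypes:
  - Ingress
EOF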
Operational Improvements
- Set up GitOps with ArgoCD
- Implement chaos engineering
- Add cost monitoring
- Automate certificate management
Ready to Deploy Multi-Region Kubernetes?
Get the complete configuration package with all Terraform files, Kubernetes manifests, and deployment scripts.