We are having a very strange behavior with unacceptable big latency for communication within a kubernetes cluster (GKE).
The latency is jumping between 600ms and 1s for a endpoint that has a Memorystore get/store action and a CloudSQL query. The same setup running locally in dev enivornment (although without k8s) is not showing this kind of latency.
About our architecture:
We are running a k8s cluster on GKE using terraform and service / deployment (yaml) files for the creation (I added those below).
We're running two node APIs (koa.js 2.5). One API is exposed with an ingress to the public and connects via a nodeport to the API pod.
The other API pod is private reachable through an internal loadbalancer from google. This API is connected to all the resource we need (CloudSQL, Cloud Storage).
Both APIs are also connected to a Memorystore (Redis).
The communication between those pods is secured with self-signed server/client certificates (which isn't the problem, we already removed it temporarily to test).
We checked the logs and saw that the request from the public API to the private one is taking about 200ms only to reach it.
Also the response to the public API from the private one took about 600ms (messured from the point when the whole business logic of the private API went throw until we received that response back at the pubilc API)
We're really out of things to try... We already connected all the Google Cloud resources to our local environment which didn't show that kind of bad latency.
In a complete local setup the latency is only about 1/5 to 1/10 of what we see in the cloud setup.
We also tried to ping the private POD from the public one which was in the 0.100ms area.
Do you have any ideas where we can further investigate ?
Here is the terraform script about our Google Cloud setup
// Configure the Google Cloud provider
provider "google" {
project = "${var.project}"
region = "${var.region}"
}
data "google_compute_zones" "available" {}
# Ensuring relevant service APIs are enabled in your project. Alternatively visit and enable the needed services
resource "google_project_service" "serviceapi" {
service = "serviceusage.googleapis.com"
disable_on_destroy = false
}
resource "google_project_service" "sqlapi" {
service = "sqladmin.googleapis.com"
disable_on_destroy = false
depends_on = ["google_project_service.serviceapi"]
}
resource "google_project_service" "redisapi" {
service = "redis.googleapis.com"
disable_on_destroy = false
depends_on = ["google_project_service.serviceapi"]
}
# Create a VPC and a subnetwork in our region
resource "google_compute_network" "appnetwork" {
name = "${var.environment}-vpn"
auto_create_subnetworks = "false"
}
resource "google_compute_subnetwork" "network-with-private-secondary-ip-ranges" {
name = "${var.environment}-vpn-subnet"
ip_cidr_range = "10.2.0.0/16"
region = "europe-west1"
network = "${google_compute_network.appnetwork.self_link}"
secondary_ip_range {
range_name = "kubernetes-secondary-range-pods"
ip_cidr_range = "10.60.0.0/16"
}
secondary_ip_range {
range_name = "kubernetes-secondary-range-services"
ip_cidr_range = "10.70.0.0/16"
}
}
# GKE cluster setup
resource "google_container_cluster" "primary" {
name = "${var.environment}-cluster"
zone = "${data.google_compute_zones.available.names[1]}"
initial_node_count = 1
description = "Kubernetes Cluster"
network = "${google_compute_network.appnetwork.self_link}"
subnetwork = "${google_compute_subnetwork.network-with-private-secondary-ip-ranges.self_link}"
depends_on = ["google_project_service.serviceapi"]
additional_zones = [
"${data.google_compute_zones.available.names[0]}",
"${data.google_compute_zones.available.names[2]}",
]
master_auth {
username = "xxxxxxx"
password = "xxxxxxx"
}
ip_allocation_policy {
cluster_secondary_range_name = "kubernetes-secondary-range-pods"
services_secondary_range_name = "kubernetes-secondary-range-services"
}
node_config {
oauth_scopes = [
"https://www.googleapis.com/auth/compute",
"https://www.googleapis.com/auth/devstorage.read_only",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/trace.append"
]
tags = ["kubernetes", "${var.environment}"]
}
}
##################
# MySQL DATABASES
##################
resource "google_sql_database_instance" "core" {
name = "${var.environment}-sql-core"
database_version = "MYSQL_5_7"
region = "${var.region}"
depends_on = ["google_project_service.sqlapi"]
settings {
# Second-generation instance tiers are based on the machine
# type. See argument reference below.
tier = "db-n1-standard-1"
}
}
resource "google_sql_database_instance" "tenant1" {
name = "${var.environment}-sql-tenant1"
database_version = "MYSQL_5_7"
region = "${var.region}"
depends_on = ["google_project_service.sqlapi"]
settings {
# Second-generation instance tiers are based on the machine
# type. See argument reference below.
tier = "db-n1-standard-1"
}
}
resource "google_sql_database_instance" "tenant2" {
name = "${var.environment}-sql-tenant2"
database_version = "MYSQL_5_7"
region = "${var.region}"
depends_on = ["google_project_service.sqlapi"]
settings {
# Second-generation instance tiers are based on the machine
# type. See argument reference below.
tier = "db-n1-standard-1"
}
}
resource "google_sql_database" "core" {
name = "project_core"
instance = "${google_sql_database_instance.core.name}"
}
resource "google_sql_database" "tenant1" {
name = "project_tenant_1"
instance = "${google_sql_database_instance.tenant1.name}"
}
resource "google_sql_database" "tenant2" {
name = "project_tenant_2"
instance = "${google_sql_database_instance.tenant2.name}"
}
##################
# MySQL USERS
##################
resource "google_sql_user" "core-user" {
name = "${var.sqluser}"
instance = "${google_sql_database_instance.core.name}"
host = "cloudsqlproxy~%"
password = "${var.sqlpassword}"
}
resource "google_sql_user" "tenant1-user" {
name = "${var.sqluser}"
instance = "${google_sql_database_instance.tenant1.name}"
host = "cloudsqlproxy~%"
password = "${var.sqlpassword}"
}
resource "google_sql_user" "tenant2-user" {
name = "${var.sqluser}"
instance = "${google_sql_database_instance.tenant2.name}"
host = "cloudsqlproxy~%"
password = "${var.sqlpassword}"
}
##################
# REDIS
##################
resource "google_redis_instance" "redis" {
name = "${var.environment}-redis"
tier = "BASIC"
memory_size_gb = 1
depends_on = ["google_project_service.redisapi"]
authorized_network = "${google_compute_network.appnetwork.self_link}"
region = "${var.region}"
location_id = "${data.google_compute_zones.available.names[0]}"
redis_version = "REDIS_3_2"
display_name = "Redis Instance"
}
# The following outputs allow authentication and connectivity to the GKE Cluster.
output "client_certificate" {
value = "${google_container_cluster.primary.master_auth.0.client_certificate}"
}
output "client_key" {
value = "${google_container_cluster.primary.master_auth.0.client_key}"
}
output "cluster_ca_certificate" {
value = "${google_container_cluster.primary.master_auth.0.cluster_ca_certificate}"
}
The service and deployment of the private API
# START CRUD POD
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
name: crud-pod
labels:
app: crud
spec:
template:
metadata:
labels:
app: crud
spec:
containers:
- name: crud
image: eu.gcr.io/dev-xxxxx/crud:latest-unstable
ports:
- containerPort: 3333
env:
- name: NODE_ENV
value: develop
volumeMounts:
- [..MountedConfigFiles..]
# [START proxy_container]
- name: cloudsql-proxy
image: gcr.io/cloudsql-docker/gce-proxy:1.11
command: ["/cloud_sql_proxy",
"-instances=dev-xxxx:europe-west1:dev-sql-core=tcp:3306,dev-xxxx:europe-west1:dev-sql-tenant1=tcp:3307,dev-xxxx:europe-west1:dev-sql-tenant2=tcp:3308",
"-credential_file=xxxx"]
volumeMounts:
- name: cloudsql-instance-credentials
mountPath: /secrets/cloudsql
readOnly: true
# [END proxy_container]
# [START volumes]
volumes:
- name: cloudsql-instance-credentials
secret:
secretName: cloudsql-instance-credentials
- [..ConfigFilesVolumes..]
# [END volumes]
# END CRUD POD
-------
# START CRUD SERVICE
apiVersion: v1
kind: Service
metadata:
name: crud
annotations:
cloud.google.com/load-balancer-type: "Internal"
spec:
type: LoadBalancer
loadBalancerSourceRanges:
- 10.60.0.0/16
ports:
- name: crud-port
port: 3333
protocol: TCP # default; can also specify UDP
selector:
app: crud # label selector for Pods to target
# END CRUD SERVICE
And the public one (including ingress)
# START SAPI POD
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
name: sapi-pod
labels:
app: sapi
spec:
template:
metadata:
labels:
app: sapi
spec:
containers:
- name: sapi
image: eu.gcr.io/dev-xxx/sapi:latest-unstable
ports:
- containerPort: 8080
env:
- name: NODE_ENV
value: develop
volumeMounts:
- [..MountedConfigFiles..]
volumes:
- [..ConfigFilesVolumes..]
# END SAPI POD
-------------
# START SAPI SERVICE
kind: Service
apiVersion: v1
metadata:
name: sapi # Service name
spec:
selector:
app: sapi
ports:
- port: 8080
targetPort: 8080
type: NodePort
# END SAPI SERVICE
--------------
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: dev-ingress
annotations:
kubernetes.io/ingress.global-static-ip-name: api-dev-static-ip
labels:
app: sapi-ingress
spec:
backend:
serviceName: sapi
servicePort: 8080
tls:
- hosts:
- xxxxx
secretName: xxxxx
We fixed the issue by removing the #google-cloud/logging-winston from our logTransport.
For some reason it blocked our traffic so that we got such bad latency.
Related
I have an AKS cluster with its LoadBalancer configured (following https://learn.microsoft.com/en-us/azure/aks/internal-lb ) so that it gets the IP from a PublicIP (all provisioned with Terraform) and targets the cluster ingress deployed with Helm.
resource "kubernetes_service" "server-loadbalacer" {
metadata {
name = "server-loadbalacer-svc"
annotations = {
"service.beta.kubernetes.io/azure-load-balancer-resource-group" = "fixit-resource-group"
}
}
spec {
type = "LoadBalancer"
load_balancer_ip = var.public_ip_address
selector = {
name = "ingress-service"
}
port {
name = "server-port"
protocol = "TCP"
port = 8080
}
}
}
Then with Helm I deploy a Node.js server listening on port 3000, a MongoDB replica set, and a Neo4 cluster.
I set up a service for the server receiving on port 3000 and targeting port 3000.
apiVersion: v1
kind: Service
metadata:
name: server-clusterip-service
spec:
type: ClusterIP
selector:
app: fixit-server-pod
ports:
- name: server-clusterip-service
protocol: TCP
port: 3000 # service port
targetPort: 3000 # por on whic the app is listening to
Then the Ingress redirects traffic to the correct service eg. server
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: ingress-service
annotations:
kubernetes.io/ingress.class: nginx
labels:
name: ingress-service
spec:
rules:
- host: fixit.westeurope.cloudapp.azure.com #dns from Azure PublicIP
http:
paths:
- path: '/server/*'
pathType: Prefix
backend:
service:
name: server-clusterip-service
port:
number: 3000
- path: '/neo4j/*'
pathType: Prefix
backend:
service:
name: fixit-cluster
port:
number: 7687
number: 7474
number: 7473
- path: '/neo4j-admin/*'
pathType: Prefix
backend:
service:
name: fixit-cluster-admin
port:
number: 6362
number: 7687
number: 7474
number: 7473
I'm expecting to go to http://fixit.westeurope.cloudapp.azure.com:8080/server/api and see the message that the server response for the endpoint /api, but it fails at browser timeout.
Pods and services deployed on the cluster are
vincenzocalia#vincenzos-MacBook-Air helm_charts % kubectl get pod
NAME READY STATUS RESTARTS AGE
fixit-cluster-0 1/1 Running 0 27m
fixit-server-868f657b64-hvmxq 1/1 Running 0 27m
mongo-rs-0 2/2 Running 0 27m
mongodb-kubernetes-operator-7c5666c957-sscsf 1/1 Running 0 4h35m
vincenzocalia#vincenzos-MacBook-Air helm_charts % kubectl get svc
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
fixit-cluster ClusterIP 10.0.230.247 <none> 7687/TCP,7474/TCP,7473/TCP 27m
fixit-cluster-admin ClusterIP 10.0.132.24 <none> 6362/TCP,7687/TCP,7474/TCP,7473/TCP 27m
kubernetes ClusterIP 10.0.0.1 <none> 443/TCP 4h44m
mongo-rs-svc ClusterIP None <none> 27017/TCP 27m
server-clusterip-service ClusterIP 10.0.242.65 <none> 3000/TCP 27m
server-loadbalacer-svc LoadBalancer 10.0.149.160 52.174.18.27 8080:32660/TCP 4h41m
The ingress is deployed as
vincenzocalia#vincenzos-MacBook-Air helm_charts % kubectl describe ingress ingress-service
Name: ingress-service
Labels: app.kubernetes.io/managed-by=Helm
name=ingress-service
Namespace: default
Address:
Ingress Class: <none>
Default backend: <default>
Rules:
Host Path Backends
---- ---- --------
fixit.westeurope.cloudapp.azure.com
/server/* server-clusterip-service:3000 (<none>)
/neo4j/* fixit-cluster:7473 (<none>)
/neo4j-admin/* fixit-cluster-admin:7473 (<none>)
Annotations: kubernetes.io/ingress.class: nginx
meta.helm.sh/release-name: fixit-cluster
meta.helm.sh/release-namespace: default
Events: <none>
and the server service is
vincenzocalia#vincenzos-MacBook-Air helm_charts % kubectl describe svc server-clusterip-service
Name: server-clusterip-service
Namespace: default
Labels: app.kubernetes.io/managed-by=Helm
Annotations: meta.helm.sh/release-name: fixit-cluster
meta.helm.sh/release-namespace: default
Selector: app=fixit-server-pod
Type: ClusterIP
IP Family Policy: SingleStack
IP Families: IPv4
IP: 10.0.160.206
IPs: 10.0.160.206
Port: server-clusterip-service 3000/TCP
TargetPort: 3000/TCP
Endpoints: 10.244.0.15:3000
Session Affinity: None
Events: <none>
I tried setting the paths with and without /* but it won't connect in either case.
Is this setup even the right way to route external traffic to the cluster or should I use just the ingress? I see that this setup has been given as the solution (1st answer) to this question Kubernetes Load balancer without Label Selector and dough it looks like we're in the same situation, I'm on AKS, and the Azure docs https://learn.microsoft.com/en-us/azure/aks/ingress-basic?tabs=azure-cli are making me have doubts about my current setup.
Can you spot what I'm setting up wrongly if this setup is not a nonsense?
Many many thanks for the help.
UPDATE
as mentioned here https://learnk8s.io/terraform-aks the option http_application_routing_enabled = true in cluster creation installs addons
vincenzocalia#vincenzos-MacBook-Air helm_charts % kubectl get pods -n kube-system | grep addon
addon-http-application-routing-external-dns-5d48bdffc6-q98nx 1/1 Running 0 26m
addon-http-application-routing-nginx-ingress-controller-5bcrf87 1/1 Running 0 26m
so the Ingress service should point to that controller in its annotations and not specify a host, so I changed the ingress service to
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: ingress-service
annotations:
# kubernetes.io/ingress.class: nginx
kubernetes.io/ingress.class: addon-http-application-routing
# nginx.ingress.kubernetes.io/rewrite-target: /
labels:
name: ingress-service
spec:
rules:
# - host: fixit.westeurope.cloudapp.azure.com #server.com
- http:
paths:
- path: '/server/*' # service
# - path: '/server' # service doesn't get a IPaddress
# - path: '/*'
# - path: '/'
pathType: Prefix
backend:
service:
name: server-clusterip-service
port:
number: 3000
# - path: '/neo4j/*'
# pathType: Prefix
# backend:
# service:
# name: fixit-cluster
# port:
# number: 7687
# number: 7474
# number: 7473
# - path: '/neo4j-admin/*'
# pathType: Prefix
# backend:
# service:
# name: fixit-cluster-admin
# port:
# number: 6362
# number: 7687
# number: 7474
# number: 7473
and its output is now
vincenzocalia#vincenzos-MacBook-Air helm_charts % kubectl get ingress
NAME CLASS HOSTS ADDRESS PORTS AGE
ingress-service <none> * 108.143.71.248 80 7s
vincenzocalia#vincenzos-MacBook-Air helm_charts % kubectl describe ingress ingress-service
Name: ingress-service
Labels: app.kubernetes.io/managed-by=Helm
name=ingress-service
Namespace: default
Address: 108.143.71.248
Ingress Class: <none>
Default backend: <default>
Rules:
Host Path Backends
---- ---- --------
*
/server/* server-clusterip-service:3000 (10.244.0.21:3000)
Annotations: kubernetes.io/ingress.class: addon-http-application-routing
meta.helm.sh/release-name: fixit-cluster
meta.helm.sh/release-namespace: default
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Sync 20s (x2 over 27s) nginx-ingress-controller Scheduled for sync
now going to http://108.143.71.248/server/api in the browser shows an Nginx 404 page.
I finally found the problem. It was my setup. I was using the default ingress-controller and load balancer that get created when you set the option http_application_routing_enabled = true on cluster creation which the docs are discouraging for production https://learn.microsoft.com/en-us/azure/aks/http-application-routing. So the proper implementation is to install an ingress controller https://learn.microsoft.com/en-us/azure/aks/ingress-basic?tabs=azure-cli, which hooks up the the internal load balancer, so there is no need to create one. Now, the Ingress controller will accept an ip address for the load balancer, but you have to create the PublicIP it in the node resource group because is going to look for it there and not in the resource group check the difference between the two here https://learn.microsoft.com/en-us/azure/aks/faq#why-are-two-resource-groups-created-with-aks.
So the working configuration is now:
main
terraform {
required_version = ">=1.1.0"
required_providers {
azurerm = {
source = "hashicorp/azurerm"
version = "~> 3.0.2"
}
}
}
provider "azurerm" {
features {
resource_group {
prevent_deletion_if_contains_resources = false
}
}
subscription_id = var.azure_subscription_id
tenant_id = var.azure_subscription_tenant_id
client_id = var.service_principal_appid
client_secret = var.service_principal_password
}
provider "kubernetes" {
host = "${module.cluster.host}"
client_certificate = "${base64decode(module.cluster.client_certificate)}"
client_key = "${base64decode(module.cluster.client_key)}"
cluster_ca_certificate = "${base64decode(module.cluster.cluster_ca_certificate)}"
}
provider "helm" {
kubernetes {
host = "${module.cluster.host}"
client_certificate = "${base64decode(module.cluster.client_certificate)}"
client_key = "${base64decode(module.cluster.client_key)}"
cluster_ca_certificate = "${base64decode(module.cluster.cluster_ca_certificate)}"
}
}
module "cluster" {
source = "./modules/cluster"
location = var.location
vm_size = var.vm_size
resource_group_name = var.resource_group_name
node_resource_group_name = var.node_resource_group_name
kubernetes_version = var.kubernetes_version
ssh_key = var.ssh_key
sp_client_id = var.service_principal_appid
sp_client_secret = var.service_principal_password
}
module "ingress-controller" {
source = "./modules/ingress-controller"
public_ip_address = module.cluster.public_ip_address
depends_on = [
module.cluster.public_ip_address
]
}
cluster
resource "azurerm_resource_group" "resource_group" {
name = var.resource_group_name
location = var.location
tags = {
Environment = "test"
Team = "DevOps"
}
}
resource "azurerm_kubernetes_cluster" "server_cluster" {
name = "server_cluster"
### choose the resource goup to use for the cluster
location = azurerm_resource_group.resource_group.location
resource_group_name = azurerm_resource_group.resource_group.name
### decide the name of the cluster "node" resource group, if unset will be named automatically
node_resource_group = var.node_resource_group_name
dns_prefix = "fixit"
kubernetes_version = var.kubernetes_version
# sku_tier = "Paid"
default_node_pool {
name = "default"
node_count = 1
min_count = 1
max_count = 3
vm_size = var.vm_size
type = "VirtualMachineScaleSets"
enable_auto_scaling = true
enable_host_encryption = false
# os_disk_size_gb = 30
}
service_principal {
client_id = var.sp_client_id
client_secret = var.sp_client_secret
}
tags = {
Environment = "Production"
}
linux_profile {
admin_username = "azureuser"
ssh_key {
key_data = var.ssh_key
}
}
network_profile {
network_plugin = "kubenet"
load_balancer_sku = "basic"
}
http_application_routing_enabled = false
depends_on = [
azurerm_resource_group.resource_group
]
}
resource "azurerm_public_ip" "public-ip" {
name = "fixit-public-ip"
location = var.location
# resource_group_name = var.resource_group_name
resource_group_name = var.node_resource_group_name
allocation_method = "Static"
domain_name_label = "fixit"
# sku = "Standard"
depends_on = [
azurerm_kubernetes_cluster.server_cluster
]
}
ingress controller
resource "helm_release" "nginx" {
name = "ingress-nginx"
repository = "ingress-nginx"
chart = "ingress-nginx/ingress-nginx"
namespace = "default"
set {
name = "controller.service.externalTrafficPolicy"
value = "Local"
}
set {
name = "controller.service.annotations.service.beta.kubernetes.io/azure-load-balancer-internal"
value = "true"
}
set {
name = "controller.service.loadBalancerIP"
value = var.public_ip_address
}
set {
name = "controller.service.annotations.service.beta.kubernetes.io/azure-load-balancer-health-probe-request-path"
value = "/healthz"
}
}
ingress service
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: ingress-service
# namespace: default
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: "false"
nginx.ingress.kubernetes.io/use-regex: "true"
nginx.ingress.kubernetes.io/rewrite-target: /$2$3$4
spec:
ingressClassName: nginx
rules:
# - host: fixit.westeurope.cloudapp.azure.com #dns from Azure PublicIP
### Node.js server
- http:
paths:
- path: /(/|$)(.*)
pathType: Prefix
backend:
service:
name: server-clusterip-service
port:
number: 80
- http:
paths:
- path: /server(/|$)(.*)
pathType: Prefix
backend:
service:
name: server-clusterip-service
port:
number: 80
...
other services omitted
Hope this can help getting the setup right.
Cheers.
I have an application that is supposed to expose 2 x ports and the application does not have the default healthcheck endpoint of / that returns 200, so at the moment, I supply a custom healthcheck endpoint just for 1 port. I haven't exposed the other port yet as I don't know how to provide another custom healthcheck endpoint for the same application.
This is how my Terraform configuration looks like.
resource "kubernetes_deployment" "core" {
metadata {
name = "core"
labels = {
app = "core"
}
}
spec {
replicas = 1
selector {
match_labels = {
app = "core"
}
}
template {
metadata {
labels = {
app = "core"
}
}
spec {
container {
name = "core"
image = "asia.gcr.io/admin/core:${var.app_version}"
port {
container_port = 8069
}
readiness_probe {
http_get {
path = "/web/database/selector"
port = "8069"
}
initial_delay_seconds = 15
period_seconds = 30
}
image_pull_policy = "IfNotPresent"
}
}
}
}
}
resource "kubernetes_service" "core_service" {
metadata {
name = "core-service"
}
spec {
type = "NodePort"
selector = {
app = "core"
}
port {
port = 8080
protocol = "TCP"
target_port = "8069"
}
}
}
How do I tell GKE to expose the other port (8072) and use a custom healthcheck endpoint for both ports?
There are a GKE Ingress feature called FrontendConfig and BackendConfig custom resource definitions (CRDs) that allow you to further customize the load balancer, you can use a Unique BackendConfig per Service port to specify a custom BackendConfig for a specific port or ports of a Service or MultiClusterService, using a key that matches the port's name or port's number. The Ingress controller uses the specific BackendConfig when it creates a load balancer backend service for a referenced Service port
When using a BackendConfig to provide a custom load balancer health check, the port number you use for the load balancer's health check can differ from the Service's spec.ports[].port number, here's an example of the service and the custom health check:
Service:
apiVersion: v1
kind: Service
metadata:
annotations:
cloud.google.com/backend-config: '{"ports": {
"service-reference-a":"backendconfig-reference-a",
"service-reference-b":"backendconfig-reference-b"
}}'
spec:
ports:
- name: port-name-1
port: port-number-1
protocol: TCP
targetPort: 50000
- name: port-name-2
port: port-number-2
protocol: TCP
targetPort: 8080
Custom Health Check:
apiVersion: cloud.google.com/v1
kind: BackendConfig
metadata:
name: my-backendconfig
spec:
healthCheck:
checkIntervalSec: interval
timeoutSec: timeout
healthyThreshold: health-threshold
unhealthyThreshold: unhealthy-threshold
type: protocol
requestPath: path
port: port
I'm having a hard time getting EKS to expose an IP address to the public internet. Do I need to set up the ALB myself or do you get that for free as part of the EKS cluster? If I have to do it myself, do I need to define it in the terraform template file or in the kubernetes object yaml?
Here's my EKS cluster defined in Terraform along with what I think are the required permissions.
// eks.tf
resource "aws_iam_role" "eks_cluster_role" {
name = "${local.env_name}-eks-cluster-role"
assume_role_policy = jsonencode({
Version = "2012-10-17",
Statement = [
{
Effect = "Allow",
Principal = {
Service = "eks.amazonaws.com"
},
Action = "sts:AssumeRole"
}
]
})
}
resource "aws_iam_role_policy_attachment" "eks-AmazonEKSClusterPolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.eks_cluster_role.name
}
resource "aws_iam_role_policy_attachment" "eks-AmazonEKSVPCResourceController" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController"
role = aws_iam_role.eks_cluster_role.name
}
resource "aws_kms_key" "eks_key" {
description = "EKS KMS Key"
deletion_window_in_days = 7
enable_key_rotation = true
tags = {
Environment = local.env_name
Service = "EKS"
}
}
resource "aws_kms_alias" "eks_key_alias" {
target_key_id = aws_kms_key.eks_key.id
name = "alias/eks-kms-key-${local.env_name}"
}
resource "aws_eks_cluster" "eks_cluster" {
name = "${local.env_name}-eks-cluster"
role_arn = aws_iam_role.eks_cluster_role.arn
enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
vpc_config {
subnet_ids = [aws_subnet.private_a.id, aws_subnet.private_b.id]
}
encryption_config {
resources = ["secrets"]
provider {
key_arn = aws_kms_key.eks_key.arn
}
}
tags = {
Environment = local.env_name
}
}
resource "aws_iam_role" "eks_node_group_role" {
name = "${local.env_name}-eks-node-group"
assume_role_policy = jsonencode({
Version = "2012-10-17",
Statement = [
{
Effect = "Allow",
Principal = {
Service = "ec2.amazonaws.com"
},
Action = "sts:AssumeRole"
}
]
})
}
resource "aws_iam_role_policy_attachment" "eks-node-group-AmazonEKSWorkerNodePolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
role = aws_iam_role.eks_node_group_role.name
}
resource "aws_iam_role_policy_attachment" "eks-node-group-AmazonEKS_CNI_Policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
role = aws_iam_role.eks_node_group_role.name
}
resource "aws_iam_role_policy_attachment" "eks-node-group-AmazonEC2ContainerRegistryReadOnly" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
role = aws_iam_role.eks_node_group_role.name
}
resource "aws_eks_node_group" "eks_node_group" {
instance_types = var.node_group_instance_types
node_group_name = "${local.env_name}-eks-node-group"
node_role_arn = aws_iam_role.eks_node_group_role.arn
cluster_name = aws_eks_cluster.eks_cluster.name
subnet_ids = [aws_subnet.private_a.id, aws_subnet.private_b.id]
scaling_config {
desired_size = 1
max_size = 1
min_size = 1
}
// Ensure that IAM Role permissions are created before and deleted after EKS Node Group handling.
// Otherwise, EKS will not be able to properly delete EC2 Instances and Elastic Network Interfaces.
depends_on = [
aws_iam_role_policy_attachment.eks-node-group-AmazonEC2ContainerRegistryReadOnly,
aws_iam_role_policy_attachment.eks-node-group-AmazonEKS_CNI_Policy,
aws_iam_role_policy_attachment.eks-node-group-AmazonEKSWorkerNodePolicy,
]
And here's my kubernetes object yaml:
# hello-kubernetes.yaml
apiVersion: v1
kind: Service
metadata:
name: hello-kubernetes
spec:
type: LoadBalancer
ports:
- port: 80
targetPort: 8080
selector:
app: hello-kubernetes
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: hello-kubernetes
spec:
replicas: 3
selector:
matchLabels:
app: hello-kubernetes
template:
metadata:
labels:
app: hello-kubernetes
spec:
containers:
- name: hello-kubernetes
image: paulbouwer/hello-kubernetes:1.9
ports:
- containerPort: 8080
---
apiVersion: networking.k8s.io/v1beta1
kind: Ingress
metadata:
name: hello-ingress
spec:
backend:
serviceName: hello-kubernetes
servicePort: 80
I've run terraform apply and the cluster is up and running. I've installed eksctl and kubectl and run kubectl apply -f hello-kubernetes.yaml. The pods, service, and ingress appear to be running fine.
$ kubectl get pods
NAME READY STATUS RESTARTS AGE
hello-kubernetes-6cb7cd595b-25bd9 1/1 Running 0 6h13m
hello-kubernetes-6cb7cd595b-lccdj 1/1 Running 0 6h13m
hello-kubernetes-6cb7cd595b-snwvr 1/1 Running 0 6h13m
$ kubectl get services
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
hello-kubernetes LoadBalancer 172.20.102.37 <pending> 80:32086/TCP 6h15m
$ kubectl get ingresses
NAME CLASS HOSTS ADDRESS PORTS AGE
hello-ingress <none> * 80 3h45m
What am I missing and which file does it belong in?
You need to install the AWS Load Balancer Controller by following the installation instructions; first you need to create IAM Role and permissions, this can be done with Terraform; then you need to apply Kubernetes Yaml for installing the controller into your cluster, this can be done with Helm or Kubectl.
You also need to be aware of the subnet tagging that is needed for e.g. creating a public or private facing load balancer.
Usually the way to go is to put an ALB and redirect traffic to the EKS cluster, managing it with the ALB Ingress Controller. This ingress controller will act as the communication between the cluster and your ALB, here is the AWS documentation that is pretty straight foward
EKS w/ALB
Other solution could be using an NGINX ingress controller with an NLB if the ALB doesn't suits your applications needs, as described in the following article
NGINX w/NLB
This happened with me too that after all the setup, I was not able to see the ingress address. The best way to debug this issue is to check the logs for the ingress controller. You can do this by:
Get the Ingress controller po name by using: kubectl get po -n kube-system
Check logs for the po using: kubectl logs <po_name> -n kube-system
This will point you to the exact issue as to why you are not seeing the address.
If you do not find any po running by the name ingress, then u will have to create ingress controller first.
I'm setting up traefik 2.0-alpha with Let's Encrypt certificates inside GKE, but now i'm in stupor with "server.go:3012: http: TLS handshake error from 10.32.0.1:2244: remote error: tls: bad certificate" error in container logs.
Connections via http working fine. When i try to connect via https, traefik return 404 with its own default certificates.
I found same problem for traefik v1 on github. Solution was in adding to config:
InsecureSkipVerify = true
passHostHeader = true
It doesn't help me.
Here is my configmap
apiVersion: v1
kind: ConfigMap
metadata:
name: traefik-ingress-configmap
namespace: kube-system
data:
traefik.toml: |
[Global]
sendAnonymousUsage = true
debug = true
logLevel = "DEBUG"
[ServersTransport]
InsecureSkipVerify = true
[entrypoints]
[entrypoints.web]
address = ":80"
[entryPoints.web-secure]
address = ":443"
[entrypoints.mongo-port]
address = ":11111"
[providers]
[providers.file]
[tcp] # YAY!
[tcp.routers]
[tcp.routers.everything-to-mongo]
entrypoints = ["mongo-port"]
rule = "HostSNI(`*`)" # Catches every request
service = "database"
[tcp.services]
[tcp.services.database.LoadBalancer]
[[tcp.services.database.LoadBalancer.servers]]
address = "mongodb-service.default.svc:11111"
[http]
[http.routers]
[http.routers.for-jupyterx-https]
entryPoints = ["web-secure"] # won't listen to entrypoint mongo-port
# rule = "Host(`clients-ui.ddns.net`)"
# rule = "Path(`/jupyterx`)" # abo /jupyterx/*
rule = "PathPrefix(`/jupyterx`)"
service = "jupyterx"
[http.routers.for-jupyterx.tls]
[http.routers.for-jupyterx-http]
entryPoints = ["web"] # won't listen to entrypoint mongo-port
# rule = "Host(`clients-ui.ddns.net`)"
# rule = "Path(`/jupyterx`)" # abo /jupyterx/*
rule = "PathPrefix(`/jupyterx`)"
service = "jupyterx"
[http.services]
[http.services.jupyterx.LoadBalancer]
PassHostHeader = true
# InsecureSkipVerify = true
[[http.services.jupyterx.LoadBalancer.servers]]
url = "http://jupyter-service.default.svc/"
weight = 100
[acme] # every router with TLS enabled will now be able to use ACME for its certificates
email = "account#mail.com"
storage = "acme.json"
# onHostRule = true # dynamic generation based on the Host() & HostSNI() matchers
caServer = "https://acme-staging-v02.api.letsencrypt.org/directory"
[acme.httpChallenge]
entryPoint = "web" # used during the challenge
And DaemonSet yaml:
# ---
# apiVersion: v1
# kind: ServiceAccount
# metadata:
# name: traefik-ingress-controller
# namespace: kube-system
---
kind: DaemonSet
apiVersion: extensions/v1beta1
metadata:
name: traefik-ingress-controller
namespace: kube-system
labels:
k8s-app: traefik-ingress-lb
spec:
template:
metadata:
labels:
k8s-app: traefik-ingress-lb
name: traefik-ingress-lb
spec:
serviceAccountName: traefik-ingress-controller
terminationGracePeriodSeconds: 60
volumes:
# - name: traefik-ui-tls-cert
# secret:
# secretName: traefik-ui-tls-cert
- name: traefik-ingress-configmap
configMap:
name: traefik-ingress-configmap
containers:
- image: traefik:2.0 # The official v2.0 Traefik docker image
name: traefik-ingress-lb
ports:
- name: http
containerPort: 80
hostPort: 80
- name: web-secure
containerPort: 443
hostPort: 443
- name: admin
containerPort: 8080
- name: mongodb
containerPort: 11111
volumeMounts:
- mountPath: "/config"
name: "traefik-ingress-configmap"
args:
- --api
- --configfile=/config/traefik.toml
---
kind: Service
apiVersion: v1
metadata:
name: traefik-ingress-service
namespace: kube-system
spec:
selector:
k8s-app: traefik-ingress-lb
ports:
- protocol: TCP
port: 80
name: web
- protocol: TCP
port: 443
name: web-secure
- protocol: TCP
port: 8080
name: admin
- port: 11111
protocol: TCP
name: mongodb
type: LoadBalancer
loadBalancerIP: 1.1.1.1
Have any suggestions, how to fix it?
Due to lack of manuals for traefik2.0-alpha, config file was written using only manual from traefik official page.
There is a "routers for HTTP & HTTPS" configuration example here https://docs.traefik.io/v2.0/routing/routers/ look like:
[http.routers]
[http.routers.Router-1-https]
rule = "Host(`foo-domain`) && Path(`/foo-path/`)"
service = "service-id"
[http.routers.Router-1.tls] # will terminate the TLS request
[http.routers.Router-1-http]
rule = "Host(`foo-domain`) && Path(`/foo-path/`)"
service = "service-id"
But working config looks like:
[http.routers]
[http.routers.Router-1-https]
rule = "Host(`foo-domain`) && Path(`/foo-path/`)"
service = "service-id"
[http.routers.Router-1-https.tls] # will terminate the TLS request
[http.routers.Router-1-http]
rule = "Host(`foo-domain`) && Path(`/foo-path/`)"
service = "service-id"
So, in my config string
[http.routers.for-jupyterx.tls]
should be changed on
[http.routers.for-jupyterx-https.tls]
I was trying to use io.fabric8 api to create a few resources in kubernetes using a pod-spec.yaml.
Config config = new ConfigBuilder()
.withNamespace("ag")
.withMasterUrl(K8_URL)
.build();
try (final KubernetesClient client = new DefaultKubernetesClient(config)) {
LOGGER.info("Master: " + client.getMasterUrl());
LOGGER.info("Loading File : " + args[0]);
Pod pod = client.pods().load(new FileInputStream(args[0])).get();
LOGGER.info("Pod created with name : " + pod.toString());
} catch (Exception e) {
LOGGER.error(e.getMessage(), e);
}
The above code works if the resource type is of POD. Similarly for other resource type it is working fine.
But if the yaml has multiple resource type like POD and service in the same file, how to use fabric8 Api ?
I was trying to use client.load(new FileInputStream(args[0])).createOrReplace(); but it is crashing with the below exception:
java.lang.NullPointerException
at java.net.URI$Parser.parse(URI.java:3042)
at java.net.URI.<init>(URI.java:588)
at io.fabric8.kubernetes.client.utils.URLUtils.join(URLUtils.java:48)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.getMandatory(BaseOperation.java:208)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.get(BaseOperation.java:177)
at io.fabric8.kubernetes.client.handlers.PodHandler.reload(PodHandler.java:53)
at io.fabric8.kubernetes.client.handlers.PodHandler.reload(PodHandler.java:32)
at io.fabric8.kubernetes.client.dsl.internal.NamespaceVisitFromServerGetWatchDeleteRecreateWaitApplicableListImpl.createOrReplace(NamespaceVisitFromServerGetWatchDeleteRecreateWaitApplicableListImpl.java:202)
at io.fabric8.kubernetes.client.dsl.internal.NamespaceVisitFromServerGetWatchDeleteRecreateWaitApplicableListImpl.createOrReplace(NamespaceVisitFromServerGetWatchDeleteRecreateWaitApplicableListImpl.java:62)
at com.nokia.k8s.InterpreterLanuch.main(InterpreterLanuch.java:66)
Yaml file used
apiVersion: v1
kind: Pod
metadata:
generateName: zep-ag-pod
annotations:
kubernetes.io/psp: restricted
spark-app-name: Zeppelin-spark-shared-process
namespace: ag
labels:
app: zeppelin
int-app-selector: shell-123
spec:
containers:
- name: ag-csf-zep
image: bcmt-registry:5000/zep-spark2.2:9
imagePullPolicy: IfNotPresent
command: ["/bin/bash"]
args: ["-c","echo Hi && sleep 60 && echo Done"]
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
runAsNonRoot: true
securityContext:
fsGroup: 2000
runAsUser: 1510
serviceAccount: csfzeppelin
serviceAccountName: csfzeppelin
---
apiVersion: v1
kind: Service
metadata:
name: zeppelin-service
namespace: ag
labels:
app: zeppelin
spec:
type: NodePort
ports:
- name: zeppelin-service
port: 30099
protocol: TCP
targetPort: 8080
selector:
app: zeppelin
You don't need to specify resource type whenever loading a file with multiple documents. You simply need to do:
// Load Yaml into Kubernetes resources
List<HasMetadata> result = client.load(new FileInputStream(args[0])).get();
// Apply Kubernetes Resources
client.resourceList(result).inNamespace(namespace).createOrReplace()