Assign memory ressource of Pods from Terraform - kubernetes

I have a K8S cluster on GCP where I have to run Data Science workload.
Some of they are in status "Evicted" because
The node was low on resource: memory. Container base was using 5417924Ki, which exceeds its request of 0.
I manage my architecture with Terraform and know how to manage cluster auto-scaling but I have no idea, even after reading the doc, how to manage this at a Pod level
resource "google_container_cluster" "k8s_cluster" {
name = "my-cluster-name
description = ""
location = var.default_region
network = var.network
subnetwork = var.subnetwork
initial_node_count = 1
remove_default_node_pool = true
ip_allocation_policy {
# VPC-native cluster using alias IP addresses
cluster_secondary_range_name = "gke-pods"
services_secondary_range_name = "gke-services"
}
maintenance_policy {
daily_maintenance_window {
start_time = "03:00"
}
}
master_authorized_networks_config {
cidr_blocks {
display_name = var.airflow.display_name
cidr_block = var.airflow.cidr_block
}
cidr_blocks {
display_name = var.gitlab.display_name
cidr_block = var.gitlab.cidr_block
}
}
network_policy {
enabled = false
}
private_cluster_config {
enable_private_endpoint = true
enable_private_nodes = true
master_ipv4_cidr_block = var.vpc_range_k8s_master
}
resource_labels = {
zone = var.zone
role = var.role
env = var.environment
}
# Disable basic auth and client certificate
master_auth {
username = ""
password = ""
client_certificate_config {
issue_client_certificate = false
}
}
cluster_autoscaling {
enabled = true
resource_limits {
resource_type = "cpu"
minimum = 1
maximum = 4
}
resource_limits {
resource_type = "memory"
minimum = 1
maximum = 2
}
}
}

Related

Getting Hashicorp Provider Error while creating aks cluster using terraform

I am getting error while creating AKS Cluster Using Terraform
Error:
Error: Failed to query available provider packages
Could not retrieve the list of available versions for provider hashicorp/file: provider registry registry.terraform.io does not have a provider named
registry.terraform.io/hashicorp/file
All modules should specify their required_providers so that external consumers will get the correct providers when using a module. To see which modules are currently depending on hashicorp/file, run the following command:
terraform providers
Above is the error i am facing. I have written Terraform code as shown below.
provider.tf:
============
provider "azurerm" {
features {}
}
terraform {
required_providers {
azurerm = {
source = "hashicorp/azurerm"
version = "2.39.0"
}
}
}
terraform.tfvars:
=================
resource_group_name = "a0474899701"
location = "CentralUS"
cluster_name = "aks01"
kubernetes_version = "1.24.4"
system_node_count = 2
user_node_count = 1
spot_node_count = 2
acr_name = "devops_acr_tf"
aks_network_plugin = "kubenet"
client_id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
client_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
main.tf:
========
# Create an Resource Group
resource "azurerm_resource_group" "aks-rg" {
name = var.resource_group_name
location = var.location
}
# Create an ACR instance
resource "azurerm_container_registry" "acr" {
name = var.acr_name
resource_group_name = azurerm_resource_group.aks-rg.name
location = var.location
sku = "Standard"
admin_enabled = false
}
# Create a role assignment to allow AKS to access ACR
resource "azurerm_role_assignment" "role_acrpull" {
scope = azurerm_container_registry.acr.id
role_definition_name = "AcrPull"
# principal_id = azurerm_kubernetes_cluster.aks.kubelet_identity.0.object_id
principal_id = azurerm_kubernetes_cluster.aks.kubelet_identity.0.client_id
skip_service_principal_aad_check = true
}
# Create a Kubernetes secret to hold the ACR credentials
# It holds the ACR credentials in a Docker config JSON format
resource "kubernetes_secret" "acr_credentials" {
metadata {
name = "acr-credentials"
}
data = {
".dockerconfigjson" = azurerm_container_registry.acr.docker_config_json
}
}
# Private Key Creation
resource "tls_private_key" "aks_ssh_key" {
algorithm = "RSA"
}
resource "tls_public_key" "aks_ssh_key" {
private_key_pem = tls_private_key.aks_ssh_key.private_key_pem
}
resource "file" "private_key" {
content = tls_private_key.aks_ssh_key.private_key_pem
filename = "aks_private_key.pem"
}
# virtual network (aks_vnet) is created in the same resource group
resource "azurerm_virtual_network" "aks_vnet" {
name = "${var.resource_group_name}-vnet01"
# address_space = ["10.0.0.0/16"]
address_space = ["10.172.144.0/26"]
location = azurerm_resource_group.aks_rg.location
resource_group_name = azurerm_resource_group.aks_rg.name
}
# subnet (aks_subnet) is created within the virtual network
resource "azurerm_subnet" "aks_subnet" {
name = "${var.resource_group_name}-vnet01-subnet01"
resource_group_name = azurerm_resource_group.aks_rg.name
virtual_network_name = azurerm_virtual_network.aks_vnet.name
# address_prefix = "10.0.1.0/24"
address_prefix = "10.172.144.0/27"
}
resource "azurerm_network_security_group" "azure-sg" {
name = "${var.resource_group_name}-nsg01"
location = azurerm_resource_group.aks_rg.location
resource_group_name = azurerm_resource_group.aks_rg.name
security_rule {
name = "allow-ssh"
priority = 100
direction = "Inbound"
access = "Allow"
protocol = "Tcp"
source_port_range = "*"
destination_port_range = "22"
source_address_prefix = "*"
destination_address_prefix = "*"
}
}
resource "azurerm_kubernetes_cluster" "aks" {
name = var.cluster_name
kubernetes_version = var.kubernetes_version
location = var.location
resource_group_name = azurerm_resource_group.aks-rg.name
security_group_name = azurerm_network_security_group.azure-sg.name
dns_prefix = var.cluster_name
default_node_pool {
name = "system"
node_count = var.system_node_count
vm_size = "Standard_E4as_v4"
os_disk_size_gb = 20
os_disk_type = "Ephemeral"
vnet_subnet_id = azurerm_subnet.aks_subnet.id
os_type = "Linux"
node_image_version = "AKSUbuntu-1804gen2containerd-2023.01.10"
enable_node_public_ip = false
enable_auto_scaling = false
}
additional_node_pools {
name = "user"
node_count = var.user_node_count
vm_size = "Standard_E8as_v4"
os_disk_size_gb = 20
os_disk_type = "Ephemeral"
vnet_subnet_id = azurerm_subnet.aks_subnet.id
type = "User"
# os_type = "RedHat"
os_type = "Linux"
node_image_version = "AKSUbuntu-1804gen2containerd-2023.01.10"
enable_node_public_ip = false
enable_auto_scaling = false
}
additional_node_pools {
name = "spot"
node_count = var.spot_node_count
vm_size = "Standard_D2s_v3"
os_disk_size_gb = 20
os_disk_type = "Ephemeral"
vnet_subnet_id = azurerm_subnet.aks_subnet.id
type = "User"
# os_type = "RedHat"
os_type = "Linux"
node_image_version = "AKSUbuntu-1804gen2containerd-2023.01.10"
max_price = 0.5
enable_node_public_ip = false
enable_auto_scaling = false
eviction_policy = "Spot"
taints = ["kubernetes.azure.com/scalesetpriority=spot:NoSchedule"]
labels = {
"kubernetes.azure.com/scalesetpriority" = "spot"
}
}
kubernetes_cluster_config {
max_pods_per_node = "110"
}
identity {
type = "SystemAssigned"
}
linux_profile {
admin_username = "azureuser"
ssh_key {
key_data = tls_public_key.aks_ssh_key.public_key_openssh
}
}
network_profile {
pod_cidr = "172.32.0.0/19"
service_cidr = "172.32.0.0/19"
load_balancer_sku = "Standard"
network_plugin = var.aks_network_plugin
dns_service_ip = "172.32.0.10"
docker_bridge_cidr = "172.34.0.1/16"
}
service_principal {
client_id = var.client_id
client_secret = var.client_secret
}
tags = {
Environment = "Development"
}
}
# ACR can be attached to the AKS cluster using the "azurerm_kubernetes_cluster_container_registry_config" resource type
resource "azurerm_kubernetes_cluster_container_registry_config" "acr_config" {
cluster_name = azurerm_kubernetes_cluster.aks.name
registry_id = azurerm_container_registry.acr.id
namespace = "aks"
default_action = "Allow"
}
Above is my Code I am facing above error. even i have changed my provider.tf still facing same issue. Can anyone please tell me How to solve this error
Thanks
I tried to reproduce the same in my environment to create AKS Cluster using Terraform:
Kindly use the below Terraform code to create AKS Cluster.
Terraform Code:
provider "azurerm" {
features {}
}
resource "azurerm_resource_group" "venkatesh" {
name = "venkat-resources"
location = "West Europe"
}
resource "azurerm_container_registry" "venkatreg" {
name = "Testcontainerregistery"
resource_group_name = azurerm_resource_group.venkatesh.name
location = azurerm_resource_group.venkatesh.location
sku = "Premium"
}
resource "azurerm_kubernetes_cluster" "venkatcluster" {
name = "example-aks1"
location = azurerm_resource_group.venkatesh.location
resource_group_name = azurerm_resource_group.venkatesh.name
dns_prefix = "exampleaks1"
default_node_pool {
name = "default"
node_count = 1
vm_size = "Standard_D2_v2"
}
identity {
type = "SystemAssigned"
}
tags = {
Environment = "Production"
}
}
resource "azurerm_role_assignment" "example" {
principal_id = azurerm_kubernetes_cluster.venkatcluster.kubelet_identity[0].object_id
role_definition_name = "AcrPull"
scope = azurerm_container_registry.venkatreg.id
skip_service_principal_aad_check = true
}
Terraform apply:
Once ran the code resources are created successfully.
Reference:
Create a Kubernetes cluster with Azure Kubernetes Service using Terraform.

Create Pods using Rancher with Terraform

I created this simple Terraform script with Rancher to create namespace in imported Kubernetes cluster:
terraform {
required_providers {
rancher2 = {
source = "rancher/rancher2"
version = "1.24.1"
}
}
}
provider "rancher2" {
api_url = "https://192.168.1.128/v3"
token_key = "token-n4fxx:4qcgctvph7qh2sdnn762zpzg889rgw8xpd2nvcnpnr4v4wpb9zljtd"
insecure = true
}
resource "rancher2_namespace" "zone-1" {
name = "zone-1"
project_id = "c-m-xmhbjzdt:p-sd86v"
description = "zone-1 namespace"
resource_quota {
limit {
limits_cpu = "100m"
limits_memory = "100Mi"
requests_storage = "1Gi"
}
}
container_resource_limit {
limits_cpu = "20m"
limits_memory = "20Mi"
requests_cpu = "1m"
requests_memory = "1Mi"
}
}
The question is how I can create Pods into the Kubernetes cluster using again Terraform script?
Terraform offers the Kubernetes Provider which allows you to create all kind of Kubernetes objects.
To quote the documentation of the "kubernetes_pod"-resource:
resource "kubernetes_pod" "test" {
metadata {
name = "terraform-example"
}
spec {
container {
image = "nginx:1.21.6"
name = "example"
env {
name = "environment"
value = "test"
}
port {
container_port = 80
}
liveness_probe {
http_get {
path = "/"
port = 80
http_header {
name = "X-Custom-Header"
value = "Awesome"
}
}
initial_delay_seconds = 3
period_seconds = 3
}
}
dns_config {
nameservers = ["1.1.1.1", "8.8.8.8", "9.9.9.9"]
searches = ["example.com"]
option {
name = "ndots"
value = 1
}
option {
name = "use-vc"
}
}
dns_policy = "None"
}
}

Post "https://***.eks.amazonaws.com/api/v1/persistentvolumes": dial tcp *****:443: i/o timeout

I'm getting this error when creating the persistent volume through terraform for EKS.
Created the EBS volume through terraform only.. It successfully created. But when try to create the Persistent volume getting the error
Please check the code below
resource "kubernetes_persistent_volume" "api-application-pv" {
metadata {
name = "api-application-pv"
}
spec {
capacity = {
storage = "2Gi"
}
access_modes = ["ReadWriteMany"]
persistent_volume_source {
aws_elastic_block_store {
volume_id = aws_launch_template.default.arn
}
}
}
}
resource "kubernetes_persistent_volume_claim" "api-application-pvc" {
metadata {
name = "api-application-pvc"
}
spec {
resources {
requests = {
storage = "2Gi"
}
}
access_modes = ["ReadWriteMany"]
storage_class_name = "gp2"
volume_name = "${kubernetes_persistent_volume.api-application-pv.metadata.0.name}"
}
wait_until_bound = false
depends_on = [kubernetes_persistent_volume.api-application-pv]
}
resource "aws_launch_template" "default" {
name_prefix = "eks-stage-template"
description = "eks-stage-template"
update_default_version = true
block_device_mappings {
device_name = "/dev/xvda"
ebs {
volume_size = 50
volume_type = "gp2"
delete_on_termination = true
encrypted = true
}
}

GKE node pool dont autoscaling to 0 nodes

I made a gke cluster using Terraform
resource "google_container_cluster" "airflow_prd" {
name = "airflow-prd"
remove_default_node_pool = true
initial_node_count = 1
network = var.vpc
location = var.zone_prd
subnetwork = var.subnet_prd
project = "xxxxxx"
private_cluster_config {
enable_private_endpoint = false
enable_private_nodes = true
master_ipv4_cidr_block = "172.13.0.0/28"
master_global_access_config {
enabled = true
}
}
ip_allocation_policy {
cluster_secondary_range_name = ""
}
}
resource "google_container_node_pool" "default_prd" {
name = "default"
cluster = google_container_cluster.airflow_prd.name
initial_node_count = 2
location = var.zone_prd
node_config {
machine_type = "e2-small"
oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
service_account = "xxxxxxxxx.iam.gserviceaccount.com"
}
autoscaling {
max_node_count = 4
min_node_count = 2
}
}
resource "google_container_node_pool" "airflow_prd" {
name = "airflow"
cluster = google_container_cluster.airflow_prd.name
initial_node_count = 0
location = var.zone_prd
node_config {
machine_type = "e2-standard-8"
oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
service_account = "xxxxxx.iam.gserviceaccount.com"
}
autoscaling {
max_node_count = 1
min_node_count = 0
}
}
resource "google_container_node_pool" "etl_32_prd" {
name = "etl-32"
cluster = google_container_cluster.airflow_prd.name
initial_node_count = 0
location = var.zone_prd
node_config {
machine_type = "e2-standard-8"
oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
service_account = "xxxxxx.iam.gserviceaccount.com"
}
autoscaling {
max_node_count = 4
min_node_count = 0
}
}
The problem is with node pool etl-32. It automatically creates nodes when needed. When it is no longer needed, the number of nodes is reduced to 1, not to 0, which is what I want. How to make it go down to 0? The system pods are all in node pool default_prd, which always has 2 nodes

terraform azurerm - cannot destroy public ip

New to terraform so i'm hoping this is an easy issue. I'm creating some resources in azure and deploying a simple flask application to AKS. Creating works fine using terraform plan. I can see that azure is provisioned correctly and I can hit the flask app.
When I try to run terraform destroy I get the error - "StatusCode=400...In order to delete the public IP, disassociate/detach the Public IP address from the resource.
Main.tf
variable "subscription_id" {}
variable "client_id" {}
variable "client_secret" {}
variable "tenant_id" {}
provider "azurerm" {
version = "=1.28.0"
tenant_id = "${var.tenant_id}"
subscription_id = "${var.subscription_id}"
}
resource "azurerm_resource_group" "aks" {
name = "${var.name_prefix}"
location = "${var.location}"
}
resource "azurerm_kubernetes_cluster" "k8s" {
name = "${var.name_prefix}-aks"
kubernetes_version = "${var.kubernetes_version}"
location = "${azurerm_resource_group.aks.location}"
resource_group_name = "${azurerm_resource_group.aks.name}"
dns_prefix = "AKS-${var.dns_prefix}"
agent_pool_profile {
name = "${var.node_pool_name}"
count = "${var.node_pool_size}"
vm_size = "${var.node_pool_vmsize}"
os_type = "${var.node_pool_os}"
os_disk_size_gb = 30
}
service_principal {
client_id = "${var.client_id}"
client_secret = "${var.client_secret}"
}
tags = {
environment = "${var.env_tag}"
}
}
provider "helm" {
install_tiller = true
kubernetes {
host = "${azurerm_kubernetes_cluster.k8s.kube_config.0.host}"
client_certificate = "${base64decode(azurerm_kubernetes_cluster.k8s.kube_config.0.client_certificate)}"
client_key = "${base64decode(azurerm_kubernetes_cluster.k8s.kube_config.0.client_key)}"
cluster_ca_certificate = "${base64decode(azurerm_kubernetes_cluster.k8s.kube_config.0.cluster_ca_certificate)}"
}
}
# Create Static Public IP Address to be used by Nginx Ingress
resource "azurerm_public_ip" "nginx_ingress" {
name = "nginx-ingress-public-ip"
location = "${azurerm_kubernetes_cluster.k8s.location}"
resource_group_name = "${azurerm_kubernetes_cluster.k8s.node_resource_group}"
allocation_method = "Static"
domain_name_label = "${var.name_prefix}"
}
# Add Kubernetes Stable Helm charts repo
data "helm_repository" "stable" {
name = "stable"
url = "https://kubernetes-charts.storage.googleapis.com"
}
# Install Nginx Ingress using Helm Chart
resource "helm_release" "nginx_ingress" {
name = "nginx-ingress"
repository = "${data.helm_repository.stable.metadata.0.name}"
chart = "nginx-ingress"
set {
name = "rbac.create"
value = "false"
}
set {
name = "controller.service.externalTrafficPolicy"
value = "Local"
}
set {
name = "controller.service.loadBalancerIP"
value = "${azurerm_public_ip.nginx_ingress.ip_address}"
}
}
Also deploying my kubernetes stuff in this file k8s.tf
provider "kubernetes" {
host = "${azurerm_kubernetes_cluster.k8s.kube_config.0.host}"
username = "${azurerm_kubernetes_cluster.k8s.kube_config.0.username}"
password = "${azurerm_kubernetes_cluster.k8s.kube_config.0.password}"
client_certificate = "${base64decode(azurerm_kubernetes_cluster.k8s.kube_config.0.client_certificate)}"
client_key = "${base64decode(azurerm_kubernetes_cluster.k8s.kube_config.0.client_key)}"
cluster_ca_certificate = "${base64decode(azurerm_kubernetes_cluster.k8s.kube_config.0.cluster_ca_certificate)}"
}
resource "kubernetes_deployment" "flask-api-deployment" {
metadata {
name = "flask-api-deployment"
}
spec {
replicas = 2
selector {
match_labels {
component = "api"
}
}
template {
metadata {
labels = {
component = "api"
}
}
spec {
container {
image = "xxx.azurecr.io/sampleflask:0.1.0"
name = "flask-api"
port {
container_port = 5000
}
}
}
}
}
}
resource "kubernetes_service" "api-cluster-ip-service" {
metadata {
name = "flask-api-cluster-ip-service"
}
spec {
selector {
component = "api"
}
port {
port = 5000
target_port = 5000
}
}
}
resource "kubernetes_ingress" "flask-ingress-service" {
metadata {
name = "flask-ingress-service"
}
spec {
backend {
service_name = "flask-api-cluster-ip-service"
service_port = 5000
}
}
}
For your issue, this is a problem about the sequence of the resources. When you create the nginx ingress with the public IP, the public IP should be created first. But when you delete the public IP, it's still in use by the nginx ingress. So It causes the error.
The solution is that you can detach the public IP from the resource which uses it. Then use the destroy the resource from the Terraform. You can take a look at the explanation in the issue.
The user #4c74356b41 is right, but to give more information assuming a config like this:
resource "azurerm_kubernetes_cluster" "k8s" {
name = "aks-e2x-nuffield-uat"
resource_group_name = azurerm_resource_group.core_rg.name
location = azurerm_resource_group.core_rg.location
dns_prefix = "aks-e2x-nuffield-uat-dns"
kubernetes_version = var.k8s_version
# NOTE currently only a single node pool, default, is configured
private_cluster_enabled = true
...
network_profile {
network_plugin = "kubenet"
load_balancer_sku = "standard"
service_cidr = var.k8s_service_subnet
pod_cidr = var.k8s_pod_subnet
docker_bridge_cidr = "172.17.0.1/16"
dns_service_ip = "40.0.8.10" # within the service subnet
}
}
Where the load_balancer_sku is set to standard, you can access the public IP to be used elsewhere like this:
data "azurerm_public_ip" "k8s_load_balancer_ip" {
name = reverse(split("/", tolist(azurerm_kubernetes_cluster.k8s.network_profile.0.load_balancer_profile.0.effective_outbound_ips)[0]))[0]
resource_group_name = azurerm_kubernetes_cluster.k8s.node_resource_group
}
output "ingress_public_ip" {
# value = azurerm_public_ip.ingress.ip_address
value = data.azurerm_public_ip.k8s_load_balancer_ip.ip_address
}