I have been working on this for nearly 5 straight days now and can't get it to work. According to the AWS documentation I should* be able to mount an EFS volume to a pod deployed to a Fargate node in Kubernetes (EKS).
I'm doing everything 100% through Terraform. I'm lost at this point and my eyes are practically bleeding from the amount of terrible documentation I have read. Any guidance anyone can give me on getting this to work would be amazing!
Here is what I have done so far:
Set up the EFS CSI driver, storage class, and role bindings (not really sure why I need these role bindings, to be honest)
resource "kubernetes_csi_driver" "efs" {
metadata {
name = "efs.csi.aws.com"
}
spec {
attach_required = false
volume_lifecycle_modes = [
"Persistent"
]
}
}
resource "kubernetes_storage_class" "efs" {
metadata {
name = "efs-sc"
}
storage_provisioner = kubernetes_csi_driver.efs.metadata[0].name
reclaim_policy = "Retain"
}
resource "kubernetes_cluster_role_binding" "efs_pre" {
metadata {
name = "efs_role_pre"
}
role_ref {
api_group = "rbac.authorization.k8s.io"
kind = "ClusterRole"
name = "cluster-admin"
}
subject {
kind = "ServiceAccount"
name = "default"
namespace = "pre"
}
}
resource "kubernetes_cluster_role_binding" "efs_live" {
metadata {
name = "efs_role_live"
}
role_ref {
api_group = "rbac.authorization.k8s.io"
kind = "ClusterRole"
name = "cluster-admin"
}
subject {
kind = "ServiceAccount"
name = "default"
namespace = "live"
}
}
Set up the EFS volume with policies and security groups
module "vpc" {
source = "../../read_only_data/vpc"
stackname = var.vpc_stackname
}
resource "aws_efs_file_system" "efs_data" {
creation_token = "xva-${var.environment}-pv-efsdata-${var.side}"
# encrypted = true
# kms_key_id = ""
performance_mode = "generalPurpose" #maxIO
throughput_mode = "bursting"
lifecycle_policy {
transition_to_ia = "AFTER_30_DAYS"
}
}
data "aws_efs_file_system" "efs_data" {
file_system_id = aws_efs_file_system.efs_data.id
}
resource "aws_efs_access_point" "efs_data" {
file_system_id = aws_efs_file_system.efs_data.id
}
/* Policy that does the following:
- Prevent root access by default
- Enforce read-only access by default
- Enforce in-transit encryption for all clients
*/
resource "aws_efs_file_system_policy" "efs_data" {
file_system_id = aws_efs_file_system.efs_data.id
policy = jsonencode({
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"AWS": "*"
},
"Action": "elasticfilesystem:ClientMount",
"Resource": aws_efs_file_system.efs_data.arn
},
{
"Effect": "Deny",
"Principal": {
"AWS": "*"
},
"Action": "*",
"Resource": aws_efs_file_system.efs_data.arn,
"Condition": {
"Bool": {
"aws:SecureTransport": "false"
}
}
}
]
})
}
# Security Groups for this volume
resource "aws_security_group" "allow_eks_cluster" {
name = "xva-${var.environment}-efsdata-${var.side}"
description = "This will allow the cluster ${data.terraform_remote_state.cluster.outputs.eks_cluster_name} to access this volume and use it."
vpc_id = module.vpc.vpc_id
ingress {
description = "NFS For EKS Cluster ${data.terraform_remote_state.cluster.outputs.eks_cluster_name}"
from_port = 2049
to_port = 2049
protocol = "tcp"
security_groups = [
data.terraform_remote_state.cluster.outputs.eks_cluster_sg_id
]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = {
Name = "allow_tls"
}
}
# Mount to the subnets that will be using this efs volume
# Also attach sg's to restrict access to this volume
resource "aws_efs_mount_target" "efs_data-app01" {
file_system_id = aws_efs_file_system.efs_data.id
subnet_id = module.vpc.private_app_subnet_01
security_groups = [
aws_security_group.allow_eks_cluster.id
]
}
resource "aws_efs_mount_target" "efs_data-app02" {
file_system_id = aws_efs_file_system.efs_data.id
subnet_id = module.vpc.private_app_subnet_02
security_groups = [
aws_security_group.allow_eks_cluster.id
]
}
Create a persistent volume in Kubernetes referencing the EFS volume
data "terraform_remote_state" "csi" {
backend = "s3"
config = {
bucket = "xva-${var.account_type}-terraform-${var.region_code}"
key = "${var.environment}/efs/driver/terraform.tfstate"
region = var.region
profile = var.profile
}
}
resource "kubernetes_persistent_volume" "efs_data" {
metadata {
name = "pv-efsdata"
labels = {
app = "example"
}
}
spec {
access_modes = ["ReadOnlyMany"]
capacity = {
storage = "25Gi"
}
volume_mode = "Filesystem"
persistent_volume_reclaim_policy = "Retain"
storage_class_name = data.terraform_remote_state.csi.outputs.storage_name
persistent_volume_source {
csi {
driver = data.terraform_remote_state.csi.outputs.csi_name
volume_handle = aws_efs_file_system.efs_data.id
read_only = true
}
}
}
}
Then create a deployment on Fargate with the pod mounting the EFS volume
data "terraform_remote_state" "efs_data_volume" {
backend = "s3"
config = {
bucket = "xva-${var.account_type}-terraform-${var.region_code}"
key = "${var.environment}/efs/volume/terraform.tfstate"
region = var.region
profile = var.profile
}
}
resource "kubernetes_persistent_volume_claim" "efs_data" {
metadata {
name = "pv-efsdata-claim-${var.side}"
namespace = var.side
}
spec {
access_modes = ["ReadOnlyMany"]
storage_class_name = data.terraform_remote_state.csi.outputs.storage_name
resources {
requests = {
storage = "25Gi"
}
}
volume_name = data.terraform_remote_state.efs_data_volume.outputs.volume_name
}
}
resource "kubernetes_deployment" "example" {
timeouts {
create = "3m"
update = "4m"
delete = "2m"
}
metadata {
name = "deployment-example"
namespace = var.side
labels = {
app = "example"
platform = "fargate"
subnet = "app"
}
}
spec {
replicas = 1
selector {
match_labels = {
app = "example"
}
}
template {
metadata {
labels = {
app = "example"
platform = "fargate"
subnet = "app"
}
}
spec {
volume {
name = "efs-data-volume"
persistent_volume_claim {
claim_name = kubernetes_persistent_volume_claim.efs_data.metadata[0].name
read_only = true
}
}
container {
image = "${var.nexus_docker_endpoint}/example:${var.docker_tag}"
name = "example"
env {
name = "environment"
value = var.environment
}
env {
name = "dockertag"
value = var.docker_tag
}
volume_mount {
name = "efs-data-volume"
read_only = true
mount_path = "/appconf/"
}
# liveness_probe {
# http_get {
# path = "/health"
# port = 443
# }
# initial_delay_seconds = 3
# period_seconds = 3
# }
port {
container_port = 443
}
}
}
}
}
}
I can see the persistent volume in Kubernetes, I can see that it is claimed, heck, I can even see that it attempts to mount the volume in the pod logs. However, I inevitably see the following error when describing the pod:
Volumes:
efs-data-volume:
Type: PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
ClaimName: pv-efsdata-claim-pre
ReadOnly: true
...
...
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning FailedMount 11m (x629 over 23h) kubelet, <redacted-fargate-endpoint> Unable to attach or mount volumes: unmounted volumes=[efs-data-volume], unattached volumes=[efs-data-volume]: timed out waiting for the condition
Warning FailedMount 47s (x714 over 23h) kubelet, <redacted-fargate-endpoint> MountVolume.SetUp failed for volume "pv-efsdata" : kubernetes.io/csi: mounter.SetupAt failed: rpc error: code = InvalidArgument desc = Volume capability not supported
I finally have done it. I have successfully mounted an EFS volume to a Fargate pod (nearly 6 days later)! I was able to get the direction I needed from this closed GitHub issue: https://github.com/aws/containers-roadmap/issues/826
It ended up being that I am using this module to build my EKS cluster: https://registry.terraform.io/modules/cloudposse/eks-cluster/aws/0.29.0?tab=outputs
If you use its "security_group_id" output, you get the "Additional" security group, which in my experience is good for absolutely nothing in AWS. Not sure why it even exists when you can't do anything with it. The security group I actually needed was the "Cluster security group". Once I added an ingress rule for the "Cluster security group"'s ID on port 2049 to the security group attached to the EFS volume's mount targets, BAM! The EFS volume mounted in the deployed pod successfully.
The other important change was switching the persistent volume's access mode to ReadWriteMany, since Fargate apparently doesn't support ReadOnlyMany.
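For anyone wiring this up in Terraform, here is a rough sketch of those two changes, reusing the resource and remote-state names from the question. Reading the cluster security group through an aws_eks_cluster data source is my assumption (its vpc_config[0].cluster_security_group_id attribute is the EKS-managed "Cluster security group"); I'm not claiming this is how the cloudposse module exposes it:
data "aws_eks_cluster" "this" {
  # Assumes the remote state exposes the cluster name, as it already does above
  name = data.terraform_remote_state.cluster.outputs.eks_cluster_name
}
resource "aws_security_group" "allow_eks_cluster" {
  name   = "xva-${var.environment}-efsdata-${var.side}"
  vpc_id = module.vpc.vpc_id
  ingress {
    description = "NFS for the EKS cluster"
    from_port   = 2049
    to_port     = 2049
    protocol    = "tcp"
    # Use the EKS-managed "Cluster security group", not the module's "Additional" one
    security_groups = [data.aws_eks_cluster.this.vpc_config[0].cluster_security_group_id]
  }
}
Alongside that, access_modes on both the kubernetes_persistent_volume and the kubernetes_persistent_volume_claim above change from ["ReadOnlyMany"] to ["ReadWriteMany"].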
I'm trying to deploy a self-managed EKS cluster with Terraform. While I can deploy the cluster with addons, VPC, subnets, and all other resources, it always fails at Helm:
Error: Kubernetes cluster unreachable: the server has asked for the client to provide credentials
with module.eks-ssp-kubernetes-addons.module.ingress_nginx[0].helm_release.nginx[0]
on .terraform/modules/eks-ssp-kubernetes-addons/modules/kubernetes-addons/ingress-nginx/main.tf line 19, in resource "helm_release" "nginx":
resource "helm_release" "nginx" {
This error repeats for metrics_server, lb_ingress, argocd, but cluster-autoscaler throws:
Warning: Helm release "cluster-autoscaler" was created but has a failed status.
with module.eks-ssp-kubernetes-addons.module.cluster_autoscaler[0].helm_release.cluster_autoscaler[0]
on .terraform/modules/eks-ssp-kubernetes-addons/modules/kubernetes-addons/cluster-autoscaler/main.tf line 1, in resource "helm_release" "cluster_autoscaler":
resource "helm_release" "cluster_autoscaler" {
My main.tf looks like this:
terraform {
backend "remote" {}
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 3.66.0"
}
kubernetes = {
source = "hashicorp/kubernetes"
version = ">= 2.7.1"
}
helm = {
source = "hashicorp/helm"
version = ">= 2.4.1"
}
}
}
data "aws_eks_cluster" "cluster" {
name = module.eks-ssp.eks_cluster_id
}
data "aws_eks_cluster_auth" "cluster" {
name = module.eks-ssp.eks_cluster_id
}
provider "aws" {
access_key = "xxx"
secret_key = "xxx"
region = "xxx"
assume_role {
role_arn = "xxx"
}
}
provider "kubernetes" {
host = data.aws_eks_cluster.cluster.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
token = data.aws_eks_cluster_auth.cluster.token
}
provider "helm" {
kubernetes {
host = data.aws_eks_cluster.cluster.endpoint
token = data.aws_eks_cluster_auth.cluster.token
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
}
}
My eks.tf looks like this:
module "eks-ssp" {
source = "github.com/aws-samples/aws-eks-accelerator-for-terraform"
# EKS CLUSTER
tenant = "DevOpsLabs2b"
environment = "dev-test"
zone = ""
terraform_version = "Terraform v1.1.4"
# EKS Cluster VPC and Subnet mandatory config
vpc_id = "xxx"
private_subnet_ids = ["xxx","xxx", "xxx", "xxx"]
# EKS CONTROL PLANE VARIABLES
create_eks = true
kubernetes_version = "1.19"
# EKS SELF MANAGED NODE GROUPS
self_managed_node_groups = {
self_mg = {
node_group_name = "DevOpsLabs2b"
subnet_ids = ["xxx","xxx", "xxx", "xxx"]
create_launch_template = true
launch_template_os = "bottlerocket" # amazonlinux2eks or bottlerocket or windows
custom_ami_id = "xxx"
public_ip = true # Enable only for public subnets
pre_userdata = <<-EOT
yum install -y amazon-ssm-agent \
systemctl enable amazon-ssm-agent && systemctl start amazon-ssm-agent \
EOT
disk_size = 10
instance_type = "t2.small"
desired_size = 2
max_size = 10
min_size = 0
capacity_type = "" # Optional Use this only for SPOT capacity as capacity_type = "spot"
k8s_labels = {
Environment = "dev-test"
Zone = ""
WorkerType = "SELF_MANAGED_ON_DEMAND"
}
additional_tags = {
ExtraTag = "t2x-on-demand"
Name = "t2x-on-demand"
subnet_type = "public"
}
create_worker_security_group = false # Creates a dedicated sec group for this Node Group
},
}
}
module "eks-ssp-kubernetes-addons" {
source = "github.com/aws-samples/aws-eks-accelerator-for-terraform//modules/kubernetes-addons"
# This is the module.eks-ssp-kubernetes-addons referenced in the errors above
enable_amazon_eks_vpc_cni = true
amazon_eks_vpc_cni_config = {
addon_name = "vpc-cni"
addon_version = "v1.7.5-eksbuild.2"
service_account = "aws-node"
resolve_conflicts = "OVERWRITE"
namespace = "kube-system"
additional_iam_policies = []
service_account_role_arn = ""
tags = {}
}
enable_amazon_eks_kube_proxy = true
amazon_eks_kube_proxy_config = {
addon_name = "kube-proxy"
addon_version = "v1.19.8-eksbuild.1"
service_account = "kube-proxy"
resolve_conflicts = "OVERWRITE"
namespace = "kube-system"
additional_iam_policies = []
service_account_role_arn = ""
tags = {}
}
#K8s Add-ons
enable_aws_load_balancer_controller = true
enable_metrics_server = true
enable_cluster_autoscaler = true
enable_aws_for_fluentbit = true
enable_argocd = true
enable_ingress_nginx = true
depends_on = [module.eks-ssp.self_managed_node_groups]
}
OP has confirmed in the comments that the problem was resolved:
Of course. I think I found the issue. Doing "kubectl get svc" throws: "An error occurred (AccessDenied) when calling the AssumeRole operation: User: arn:aws:iam::xxx:user/terraform_deploy is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam::xxx:user/terraform_deploy"
Solved it by using my actual role, that's crazy. No idea why it was calling itself.
For a similar problem, also see this issue.
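To illustrate the fix the OP describes, the assume_role target needs to be an IAM role the terraform_deploy user is allowed to assume, not the user's own ARN. A minimal, hypothetical sketch (the role name is made up; account and region stay redacted as in the question):
provider "aws" {
  region = "xxx"
  assume_role {
    # Point at an IAM *role* that arn:aws:iam::xxx:user/terraform_deploy may assume,
    # not at the user ARN itself (role name below is hypothetical)
    role_arn = "arn:aws:iam::xxx:role/terraform-deploy"
  }
}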
I solved this error by adding dependencies to the Helm installations.
The depends_on waits for the referenced step to complete successfully, and only then does the Helm module run.
module "nginx-ingress" {
depends_on = [module.eks, module.aws-load-balancer-controller]
source = "terraform-module/release/helm"
...}
module "aws-load-balancer-controller" {
depends_on = [module.eks]
source = "terraform-module/release/helm"
...}
module "helm_autoscaler" {
depends_on = [module.eks]
source = "terraform-module/release/helm"
...}
I'm currently trying to create a Persistent Volume Claim, but it throws the error:
"Cannot bind to requested volume "pv": storageClassName does not match"
resource "google_compute_disk" "default" {
name = "pv"
type = "pd-ssd"
zone = "us-east1-a"
size = 2
}
resource "kubernetes_persistent_volume" "default" {
metadata {
name = "pv"
}
spec {
capacity = {
storage = "2Gi"
}
access_modes = ["ReadWriteMany"]
persistent_volume_source {
gce_persistent_disk {
pd_name = "pv"
fs_type = "ext4"
}
}
}
}
resource "kubernetes_persistent_volume_claim" "default" {
metadata {
name = "pv"
}
spec {
access_modes = ["ReadWriteMany"]
resources {
requests = {
storage = "2Gi"
}
}
storage_class_name = "pv"
volume_name = "pv"
}
}
I've tried to follow Why does a match Persistent Volume not bind to a match Persistent Volume Claim (using k3s)?, but I still get the same issue of "Cannot bind to requested volume "pv": storageClassName does not match"
How can I fix this issue? - I've tried to ensure that the name of all the resources match, but I think I'm missing something.
Any help on this will be greatly appreciated!
You might change
storage_class_name = "pv"
to
storage_class_name = "manual"
and set the same storage_class_name = "manual" on the persistent volume's spec as well; the claim can only bind when the storage class names of the PV and the PVC match.
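A minimal sketch of what that looks like with the question's resources, assuming static binding through a class called "manual" (the class name is arbitrary, it just has to be identical on both sides):
resource "kubernetes_persistent_volume" "default" {
  metadata {
    name = "pv"
  }
  spec {
    storage_class_name = "manual" # must match the claim below
    capacity = {
      storage = "2Gi"
    }
    access_modes = ["ReadWriteMany"]
    persistent_volume_source {
      gce_persistent_disk {
        pd_name = google_compute_disk.default.name
        fs_type = "ext4"
      }
    }
  }
}
resource "kubernetes_persistent_volume_claim" "default" {
  metadata {
    name = "pv"
  }
  spec {
    storage_class_name = "manual" # same class name as the PV
    access_modes       = ["ReadWriteMany"]
    volume_name        = kubernetes_persistent_volume.default.metadata[0].name
    resources {
      requests = {
        storage = "2Gi"
      }
    }
  }
}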
I have a K8s cluster that has SMB-mounted drives connected to an AWS Storage Gateway / file share. We've recently migrated that SGW to another AWS account, and in doing so the IP address and password for the SGW changed.
I noticed that our existing setup has a K8s storage class that looks for a K8s secret called "smbcreds". That secret has the keys "username" and "password". I'm assuming this is in line with the setup guide for the Helm chart we're using, "csi-driver-smb".
I assumed changing the secret used by the storage class would update everything downstream that uses that storage class, but apparently it does not. I'm obviously a little cautious when it comes to potentially blowing away important data, so what do I need to do to update everything to use the new secret and IP config?
Here is a simple example of our setup in Terraform -
provider "kubernetes" {
config_path = "~/.kube/config"
config_context = "minikube"
}
resource "helm_release" "container_storage_interface_for_aws" {
count = 1
name = "local-filesystem-csi"
repository = "https://raw.githubusercontent.com/kubernetes-csi/csi-driver-smb/master/charts"
chart = "csi-driver-smb"
namespace = "default"
}
resource "kubernetes_storage_class" "aws_storage_gateway" {
count = 1
metadata {
name = "smbmount"
}
storage_provisioner = "smb.csi.k8s.io"
reclaim_policy = "Retain"
volume_binding_mode = "WaitForFirstConsumer"
parameters = {
source = "//1.2.3.4/old-file-share"
"csi.storage.k8s.io/node-stage-secret-name" = "smbcreds"
"csi.storage.k8s.io/node-stage-secret-namespace" = "default"
}
mount_options = ["vers=3.0", "dir_mode=0777", "file_mode=0777"]
}
resource "kubernetes_persistent_volume_claim" "aws_storage_gateway" {
count = 1
metadata {
name = "smbmount-volume-claim"
}
spec {
access_modes = ["ReadWriteMany"]
resources {
requests = {
storage = "10Gi"
}
}
storage_class_name = "smbmount"
}
}
resource "kubernetes_deployment" "main" {
metadata {
name = "sample-pod"
}
spec {
replicas = 1
selector {
match_labels = {
app = "sample-pod"
}
}
template {
metadata {
labels = {
app = "sample-pod"
}
}
spec {
volume {
name = "shared-fileshare"
persistent_volume_claim {
claim_name = "smbmount-volume-claim"
}
}
container {
name = "ubuntu"
image = "ubuntu"
command = ["sleep", "3600"]
image_pull_policy = "IfNotPresent"
volume_mount {
name = "shared-fileshare"
read_only = false
mount_path = "/data"
}
}
}
}
}
}
My original change was to update the K8s secret "smbcreds" and to change source = "//1.2.3.4/old-file-share" to source = "//5.6.7.8/new-file-share".
The solution I settled on was to create a second K8s storage class and persistent volume claim connected to the new AWS Storage Gateway, then switch the K8s deployments to use the new PVC.
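Sketched out in the same Terraform style as above, that second storage class, its new secret, and the new claim might look like this; the class name, secret name, and credential values are hypothetical stand-ins:
resource "kubernetes_secret" "smbcreds_v2" {
  metadata {
    name      = "smbcreds-v2" # hypothetical new secret name
    namespace = "default"
  }
  data = {
    username = "new-sgw-username" # placeholder
    password = "new-sgw-password" # placeholder
  }
}
resource "kubernetes_storage_class" "aws_storage_gateway_v2" {
  metadata {
    name = "smbmount-v2" # hypothetical new class name
  }
  storage_provisioner = "smb.csi.k8s.io"
  reclaim_policy      = "Retain"
  volume_binding_mode = "WaitForFirstConsumer"
  parameters = {
    source                                           = "//5.6.7.8/new-file-share"
    "csi.storage.k8s.io/node-stage-secret-name"      = kubernetes_secret.smbcreds_v2.metadata[0].name
    "csi.storage.k8s.io/node-stage-secret-namespace" = "default"
  }
  mount_options = ["vers=3.0", "dir_mode=0777", "file_mode=0777"]
}
resource "kubernetes_persistent_volume_claim" "aws_storage_gateway_v2" {
  metadata {
    name = "smbmount-volume-claim-v2"
  }
  spec {
    access_modes       = ["ReadWriteMany"]
    storage_class_name = kubernetes_storage_class.aws_storage_gateway_v2.metadata[0].name
    resources {
      requests = {
        storage = "10Gi"
      }
    }
  }
}
The existing deployment's persistent_volume_claim block is then pointed at "smbmount-volume-claim-v2".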
I have a Kubernetes setup with a cluster and two pools (nodes), and I have also set up an (nginx) ingress server for Kubernetes with Helm. All of this is written in Terraform for Scaleway. What I am struggling with is how to configure the ingress server to route to my Kubernetes pools/nodes depending on the URL path.
For example, I want [url]/api to go to my scaleway_k8s_pool.api and [url]/auth to go to my scaleway_k8s_pool.auth.
This is my Terraform code:
provider "scaleway" {
zone = "fr-par-1"
region = "fr-par"
}
resource "scaleway_registry_namespace" "main" {
name = "main_container_registry"
description = "Main container registry"
is_public = false
}
resource "scaleway_k8s_cluster" "main" {
name = "main"
description = "The main cluster"
version = "1.20.5"
cni = "calico"
tags = ["i'm an awsome tag"]
autoscaler_config {
disable_scale_down = false
scale_down_delay_after_add = "5m"
estimator = "binpacking"
expander = "random"
ignore_daemonsets_utilization = true
balance_similar_node_groups = true
expendable_pods_priority_cutoff = -5
}
}
resource "scaleway_k8s_pool" "api" {
cluster_id = scaleway_k8s_cluster.main.id
name = "api"
node_type = "DEV1-M"
size = 1
autoscaling = true
autohealing = true
min_size = 1
max_size = 5
}
resource "scaleway_k8s_pool" "auth" {
cluster_id = scaleway_k8s_cluster.main.id
name = "auth"
node_type = "DEV1-M"
size = 1
autoscaling = true
autohealing = true
min_size = 1
max_size = 5
}
resource "null_resource" "kubeconfig" {
depends_on = [scaleway_k8s_pool.api, scaleway_k8s_pool.auth] # at least one pool here
triggers = {
host = scaleway_k8s_cluster.main.kubeconfig[0].host
token = scaleway_k8s_cluster.main.kubeconfig[0].token
cluster_ca_certificate = scaleway_k8s_cluster.main.kubeconfig[0].cluster_ca_certificate
}
}
output "cluster_url" {
value = scaleway_k8s_cluster.main.apiserver_url
}
provider "helm" {
kubernetes {
host = null_resource.kubeconfig.triggers.host
token = null_resource.kubeconfig.triggers.token
cluster_ca_certificate = base64decode(
null_resource.kubeconfig.triggers.cluster_ca_certificate
)
}
}
resource "helm_release" "ingress" {
name = "ingress"
chart = "ingress-nginx"
repository = "https://kubernetes.github.io/ingress-nginx"
namespace = "kube-system"
}
How would I go about configuring the nginx ingress server to route to my Kubernetes pools?
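For orientation, an NGINX Ingress routes by URL path to Kubernetes Services rather than directly to node pools, so the usual pattern is one Deployment/Service per path (with the Deployment pinned to a pool via a node selector on that pool's node labels) and an Ingress on top. A hedged sketch of the Ingress side, assuming a kubernetes provider configured against the same kubeconfig; the service names and ports are hypothetical:
resource "kubernetes_ingress_v1" "main" {
  metadata {
    name = "main-ingress"
    annotations = {
      "nginx.ingress.kubernetes.io/rewrite-target" = "/"
    }
  }
  spec {
    ingress_class_name = "nginx"
    rule {
      http {
        path {
          path      = "/api"
          path_type = "Prefix"
          backend {
            service {
              name = "api-service" # Service whose pods run on the api pool
              port {
                number = 80
              }
            }
          }
        }
        path {
          path      = "/auth"
          path_type = "Prefix"
          backend {
            service {
              name = "auth-service" # Service whose pods run on the auth pool
              port {
                number = 80
              }
            }
          }
        }
      }
    }
  }
}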
Following this tutorial,
https://learn.hashicorp.com/tutorials/terraform/gke?in=terraform/kubernetes
I have deployed a GKE cluster in GCloud.
Now when I try to schedule a deployment following this link,
https://learn.hashicorp.com/tutorials/terraform/kubernetes-provider
it fails with:
kubernetes_deployment.nginx: Creating...
Error: Failed to create deployment: Post "https://<ip>/apis/apps/v1/namespaces/default/deployments": x509: certificate signed by unknown authority
on kubernetes.tf line 21, in resource "kubernetes_deployment" "nginx":
21: resource "kubernetes_deployment" "nginx" {
My kubernetes.tf looks like this:
terraform {
required_providers {
kubernetes = {
source = "hashicorp/kubernetes"
}
}
}
provider "kubernetes" {
load_config_file = false
host = google_container_cluster.primary.endpoint
username = var.gke_username
password = var.gke_password
client_certificate = google_container_cluster.primary.master_auth.0.client_certificate
client_key = google_container_cluster.primary.master_auth.0.client_key
cluster_ca_certificate = google_container_cluster.primary.master_auth.0.cluster_ca_certificate
}
resource "kubernetes_deployment" "nginx" {
metadata {
name = "scalable-nginx-example"
labels = {
App = "ScalableNginxExample"
}
}
spec {
replicas = 2
selector {
match_labels = {
App = "ScalableNginxExample"
}
}
template {
metadata {
labels = {
App = "ScalableNginxExample"
}
}
spec {
container {
image = "nginx:1.7.8"
name = "example"
port {
container_port = 80
}
resources {
limits {
cpu = "0.5"
memory = "512Mi"
}
requests {
cpu = "250m"
memory = "50Mi"
}
}
}
}
}
}
}
I am using macOS to run Terraform. Any help is appreciated.
Please note that kubectl get pods --all-namespaces works fine, so I don't think it's an issue with my kube config.
Thanks,
Arun
It was because the certificate was base64-encoded; changing the provider section to the snippet below got rid of the issue.
provider "kubernetes" {
load_config_file = false
host = google_container_cluster.primary.endpoint
username = var.gke_username
password = var.gke_password
client_certificate = base64decode(google_container_cluster.primary.master_auth.0.client_certificate)
client_key = base64decode(google_container_cluster.primary.master_auth.0.client_key)
cluster_ca_certificate = base64decode(google_container_cluster.primary.master_auth.0.cluster_ca_certificate)
}