Is there any way to inject environment variables into the init-containers that come pre-loaded with your InferenceService/deployment? - kubernetes
I am creating an InferenceService instance using the following YAML file:
apiVersion: "serving.kubeflow.org/v1alpha2"
kind: "InferenceService"
metadata:
minReplicas: 1
name: "sklearn-iris"
spec:
default:
predictor:
sklearn:
storageUri: "gs://kfserving-samples/models/sklearn/iris"
Now this will create a deployment, and since I am working from behind a proxy I am injecting the proxy environment variables as follows:
kubectl set env deployment/sklearn-iris-predictor-default-dclkq-deployment -n kfserving-test http_proxy={http_proxy value}
kubectl set env deployment/sklearn-iris-predictor-default-dclkq-deployment -n kfserving-test https_proxy={https_proxy value}
kubectl set env deployment/sklearn-iris-predictor-default-dclkq-deployment -n kfserving-test no_proxy={no_proxy value}
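The three variables can also be set in a single invocation, which should be equivalent (values are placeholders, as above):
kubectl set env deployment/sklearn-iris-predictor-default-dclkq-deployment -n kfserving-test http_proxy={http_proxy value} https_proxy={https_proxy value} no_proxy={no_proxy value}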
Since I have set the minimum replicas to 1, one pod stays up even without traffic. When this pod is created it runs one init-container and two containers. With the kubectl set env approach the proxy variables are set on the containers but not on the init-container, so the init-container fails and the whole deployment fails.
So, in short: is there any way to set the proxy/env details on the init-container without having the whole deployment YAML file available to configure the env?
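To confirm which containers actually received the variables, the pod spec can be queried directly; <pod-name> below is a placeholder for the actual pod name:
# list the init-container names in the pod
kubectl get pod <pod-name> -n kfserving-test -o jsonpath='{.spec.initContainers[*].name}'
# dump the env of the first init-container (empty output means no variables were injected)
kubectl get pod <pod-name> -n kfserving-test -o jsonpath='{.spec.initContainers[0].env}'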
Edit:
kubectl edit deploy/deployment_name -o yaml -n namespace gives
apiVersion: apps/v1
kind: Deployment
metadata:
annotations:
autoscaling.knative.dev/class: kpa.autoscaling.knative.dev
autoscaling.knative.dev/minScale: "1"
deployment.kubernetes.io/revision: "1"
internal.serving.kubeflow.org/storage-initializer-sourceuri: gs://kfserving-samples/models/sklearn/iris
serving.knative.dev/creator: system:serviceaccount:kfserving-system:default
creationTimestamp: "2021-02-03T06:51:29Z"
generation: 3
labels:
app: sklearn-iris-predictor-default-6xcgj
component: predictor
service.istio.io/canonical-name: sklearn-iris-predictor-default
service.istio.io/canonical-revision: sklearn-iris-predictor-default-6xcgj
serving.knative.dev/configuration: sklearn-iris-predictor-default
serving.knative.dev/configurationGeneration: "1"
serving.knative.dev/revision: sklearn-iris-predictor-default-6xcgj
serving.knative.dev/revisionUID: 470195f7-db41-4e9c-ac6b-c96c79a1218f
serving.knative.dev/service: sklearn-iris-predictor-default
serving.kubeflow.org/inferenceservice: sklearn-iris
name: sklearn-iris-predictor-default-6xcgj-deployment
namespace: kfserving-test
ownerReferences:
- apiVersion: serving.knative.dev/v1
blockOwnerDeletion: true
controller: true
kind: Revision
name: sklearn-iris-predictor-default-6xcgj
uid: 470195f7-db41-4e9c-ac6b-c96c79a1218f
resourceVersion: "633491"
selfLink: /apis/apps/v1/namespaces/kfserving-test/deployments/sklearn-iris-predictor-default-6xcgj-deployment
uid: 2fc10485-ba59-4eaf-b62a-480ecf4ab078
spec:
progressDeadlineSeconds: 120
replicas: 1
revisionHistoryLimit: 10
selector:
matchLabels:
serving.knative.dev/revisionUID: 470195f7-db41-4e9c-ac6b-c96c79a1218f
strategy:
rollingUpdate:
maxSurge: 25%
maxUnavailable: 25%
type: RollingUpdate
template:
metadata:
annotations:
autoscaling.knative.dev/class: kpa.autoscaling.knative.dev
autoscaling.knative.dev/minScale: "1"
internal.serving.kubeflow.org/storage-initializer-sourceuri: gs://kfserving-samples/models/sklearn/iris
serving.knative.dev/creator: system:serviceaccount:kfserving-system:default
creationTimestamp: null
labels:
app: sklearn-iris-predictor-default-6xcgj
component: predictor
service.istio.io/canonical-name: sklearn-iris-predictor-default
service.istio.io/canonical-revision: sklearn-iris-predictor-default-6xcgj
serving.knative.dev/configuration: sklearn-iris-predictor-default
serving.knative.dev/configurationGeneration: "1"
serving.knative.dev/revision: sklearn-iris-predictor-default-6xcgj
serving.knative.dev/revisionUID: 470195f7-db41-4e9c-ac6b-c96c79a1218f
serving.knative.dev/service: sklearn-iris-predictor-default
serving.kubeflow.org/inferenceservice: sklearn-iris
spec:
containers:
- args:
- --model_name=sklearn-iris
- --model_dir=/mnt/models
- --http_port=8080
- --workers=0
env:
- name: http_proxy
value: {proxy data}
- name: https_proxy
value: {proxy data}
- name: no_proxy
value: {no proxy data}
- name: PORT
value: "8080"
- name: K_REVISION
value: sklearn-iris-predictor-default-6xcgj
- name: K_CONFIGURATION
value: sklearn-iris-predictor-default
- name: K_SERVICE
value: sklearn-iris-predictor-default
- name: K_INTERNAL_POD_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.name
- name: K_INTERNAL_POD_NAMESPACE
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
image: gcr.io/kfserving/sklearnserver@sha256:fd87e984a6092aae6efd28a2d596aac16d83d207a0269a503a221cb24cfd2f39
imagePullPolicy: IfNotPresent
lifecycle:
preStop:
httpGet:
path: /wait-for-drain
port: 8022
scheme: HTTP
name: kfserving-container
ports:
- containerPort: 8080
name: user-port
protocol: TCP
resources:
limits:
cpu: "1"
memory: 2Gi
requests:
cpu: "1"
memory: 2Gi
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: FallbackToLogsOnError
volumeMounts:
- mountPath: /var/log
name: knative-var-log
subPathExpr: $(K_INTERNAL_POD_NAMESPACE)_$(K_INTERNAL_POD_NAME)_kfserving-container
- env:
- name: SERVING_NAMESPACE
value: kfserving-test
- name: SERVING_SERVICE
value: sklearn-iris-predictor-default
- name: SERVING_CONFIGURATION
value: sklearn-iris-predictor-default
- name: SERVING_REVISION
value: sklearn-iris-predictor-default-6xcgj
- name: QUEUE_SERVING_PORT
value: "8012"
- name: CONTAINER_CONCURRENCY
value: "0"
- name: REVISION_TIMEOUT_SECONDS
value: "300"
- name: SERVING_POD
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.name
- name: SERVING_POD_IP
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.podIP
- name: SERVING_LOGGING_CONFIG
value: |-
{
"level": "info",
"development": false,
"outputPaths": ["stdout"],
"errorOutputPaths": ["stderr"],
"encoding": "json",
"encoderConfig": {
"timeKey": "ts",
"levelKey": "level",
"nameKey": "logger",
"callerKey": "caller",
"messageKey": "msg",
"stacktraceKey": "stacktrace",
"lineEnding": "",
"levelEncoder": "",
"timeEncoder": "iso8601",
"durationEncoder": "",
"callerEncoder": ""
}
}
- name: SERVING_LOGGING_LEVEL
- name: SERVING_REQUEST_LOG_TEMPLATE
value: '{"httpRequest": {"requestMethod": "{{.Request.Method}}", "requestUrl":
"{{js .Request.RequestURI}}", "requestSize": "{{.Request.ContentLength}}",
"status": {{.Response.Code}}, "responseSize": "{{.Response.Size}}", "userAgent":
"{{js .Request.UserAgent}}", "remoteIp": "{{js .Request.RemoteAddr}}",
"serverIp": "{{.Revision.PodIP}}", "referer": "{{js .Request.Referer}}",
"latency": "{{.Response.Latency}}s", "protocol": "{{.Request.Proto}}"},
"traceId": "{{index .Request.Header "X-B3-Traceid"}}"}'
- name: SERVING_ENABLE_REQUEST_LOG
value: "false"
- name: SERVING_REQUEST_METRICS_BACKEND
value: prometheus
- name: TRACING_CONFIG_BACKEND
value: none
- name: TRACING_CONFIG_ZIPKIN_ENDPOINT
- name: TRACING_CONFIG_STACKDRIVER_PROJECT_ID
- name: TRACING_CONFIG_DEBUG
value: "false"
- name: TRACING_CONFIG_SAMPLE_RATE
value: "0.1"
- name: USER_PORT
value: "8080"
- name: SYSTEM_NAMESPACE
value: knative-serving
- name: METRICS_DOMAIN
value: knative.dev/internal/serving
- name: SERVING_READINESS_PROBE
value: '{"tcpSocket":{"port":8080,"host":"127.0.0.1"},"successThreshold":1}'
- name: ENABLE_PROFILING
value: "false"
- name: SERVING_ENABLE_PROBE_REQUEST_LOG
value: "false"
- name: METRICS_COLLECTOR_ADDRESS
image: gcr.io/knative-releases/knative.dev/serving/cmd/queue@sha256:0db974f58b48b219ab8047e11b481c2bbda52b7a2d54db5ed58e8659748ec125
imagePullPolicy: IfNotPresent
name: queue-proxy
ports:
- containerPort: 8022
name: http-queueadm
protocol: TCP
- containerPort: 9090
name: http-autometric
protocol: TCP
- containerPort: 9091
name: http-usermetric
protocol: TCP
- containerPort: 8012
name: queue-port
protocol: TCP
readinessProbe:
exec:
command:
- /ko-app/queue
- -probe-period
- "0"
failureThreshold: 3
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 10
resources:
requests:
cpu: 25m
securityContext:
allowPrivilegeEscalation: false
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
dnsPolicy: ClusterFirst
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
terminationGracePeriodSeconds: 300
volumes:
- emptyDir: {}
name: knative-var-log
status:
conditions:
- lastTransitionTime: "2021-02-03T06:51:29Z"
lastUpdateTime: "2021-02-03T06:51:29Z"
message: Deployment does not have minimum availability.
reason: MinimumReplicasUnavailable
status: "False"
type: Available
- lastTransitionTime: "2021-02-03T07:38:08Z"
lastUpdateTime: "2021-02-03T07:38:08Z"
message: ReplicaSet "sklearn-iris-predictor-default-6xcgj-deployment-7c97895d96"
has timed out progressing.
reason: ProgressDeadlineExceeded
status: "False"
type: Progressing
observedGeneration: 2
replicas: 1
unavailableReplicas: 1
updatedReplicas: 1
kubectl describe pod podname -n namespace gives
Name: sklearn-iris-predictor-default-6xcgj-deployment-7c97895d96vqbgr
Namespace: kfserving-test
Priority: 0
Node: minikube/192.168.99.109
Start Time: Wed, 03 Feb 2021 13:50:33 +0530
Labels: app=sklearn-iris-predictor-default-6xcgj
component=predictor
pod-template-hash=7c97895d96
service.istio.io/canonical-name=sklearn-iris-predictor-default
service.istio.io/canonical-revision=sklearn-iris-predictor-default-6xcgj
serving.knative.dev/configuration=sklearn-iris-predictor-default
serving.knative.dev/configurationGeneration=1
serving.knative.dev/revision=sklearn-iris-predictor-default-6xcgj
serving.knative.dev/revisionUID=470195f7-db41-4e9c-ac6b-c96c79a1218f
serving.knative.dev/service=sklearn-iris-predictor-default
serving.kubeflow.org/inferenceservice=sklearn-iris
Annotations: autoscaling.knative.dev/class: kpa.autoscaling.knative.dev
autoscaling.knative.dev/minScale: 1
internal.serving.kubeflow.org/storage-initializer-sourceuri: gs://kfserving-samples/models/sklearn/iris
serving.knative.dev/creator: system:serviceaccount:kfserving-system:default
Status: Pending
IP: 172.17.0.22
Controlled By: ReplicaSet/sklearn-iris-predictor-default-6xcgj-deployment-7c97895d96
Init Containers:
storage-initializer:
Container ID: docker://262a195f39fad7dfc62b494d9c9bbda8c7cdeee2f4b903b2948b809c5e00fb0c
Image: gcr.io/kfserving/storage-initializer:v0.5.0-rc2
Image ID: docker-pullable://gcr.io/kfserving/storage-initializer@sha256:9a16e6af385412bb62fd7e09f6d749e107e3ad92c488039acd20361fb5dd68cc
Port: <none>
Host Port: <none>
Args:
gs://kfserving-samples/models/sklearn/iris
/mnt/models
State: Running
Started: Wed, 03 Feb 2021 13:58:00 +0530
Last State: Terminated
Reason: Error
Message: ownload(src_uri, dest_path)
File "/usr/local/lib/python3.7/site-packages/kfserving/storage.py", line 58, in download
Storage._download_gcs(uri, out_dir)
File "/usr/local/lib/python3.7/site-packages/kfserving/storage.py", line 116, in _download_gcs
for blob in blobs:
File "/usr/local/lib/python3.7/site-packages/google/api_core/page_iterator.py", line 212, in _items_iter
for page in self._page_iter(increment=False):
File "/usr/local/lib/python3.7/site-packages/google/api_core/page_iterator.py", line 243, in _page_iter
page = self._next_page()
File "/usr/local/lib/python3.7/site-packages/google/api_core/page_iterator.py", line 369, in _next_page
response = self._get_next_page_response()
File "/usr/local/lib/python3.7/site-packages/google/api_core/page_iterator.py", line 419, in _get_next_page_response
method=self._HTTP_METHOD, path=self.path, query_params=params
File "/usr/local/lib/python3.7/site-packages/google/cloud/storage/_http.py", line 63, in api_request
return call()
File "/usr/local/lib/python3.7/site-packages/google/api_core/retry.py", line 286, in retry_wrapped_func
on_error=on_error,
File "/usr/local/lib/python3.7/site-packages/google/api_core/retry.py", line 206, in retry_target
last_exc,
File "<string>", line 3, in raise_from
google.api_core.exceptions.RetryError: Deadline of 120.0s exceeded while calling functools.partial(functools.partial(<bound method JSONConnection.api_request of <google.cloud.storage._http.Connection object at 0x7fd57c954c50>>, timeout=60, method='GET', path='/b/kfserving-samples/o', query_params={'projection': 'noAcl', 'prefix': 'models/sklearn/iris/'})), last exception: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /storage/v1/b/kfserving-samples/o?projection=noAcl&prefix=models%2Fsklearn%2Firis%2F&prettyPrint=false (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fd57c91bb90>: Failed to establish a new connection: [Errno 113] No route to host'))
Exit Code: 1
Started: Wed, 03 Feb 2021 13:53:53 +0530
Finished: Wed, 03 Feb 2021 13:57:45 +0530
Ready: False
Restart Count: 2
Limits:
cpu: 1
memory: 1Gi
Requests:
cpu: 100m
memory: 100Mi
Environment: <none>
Mounts:
/mnt/models from kfserving-provision-location (rw)
/var/run/secrets/kubernetes.io/serviceaccount from default-token-hw2rw (ro)
Containers:
kfserving-container:
Container ID:
Image: gcr.io/kfserving/sklearnserver@sha256:fd87e984a6092aae6efd28a2d596aac16d83d207a0269a503a221cb24cfd2f39
Image ID:
Port: 8080/TCP
Host Port: 0/TCP
Args:
--model_name=sklearn-iris
--model_dir=/mnt/models
--http_port=8080
--workers=0
State: Waiting
Reason: PodInitializing
Ready: False
Restart Count: 0
Limits:
cpu: 1
memory: 2Gi
Requests:
cpu: 1
memory: 2Gi
Environment:
PORT: 8080
K_REVISION: sklearn-iris-predictor-default-6xcgj
K_CONFIGURATION: sklearn-iris-predictor-default
K_SERVICE: sklearn-iris-predictor-default
K_INTERNAL_POD_NAME: sklearn-iris-predictor-default-6xcgj-deployment-7c97895d96vqbgr (v1:metadata.name)
K_INTERNAL_POD_NAMESPACE: kfserving-test (v1:metadata.namespace)
Mounts:
/mnt/models from kfserving-provision-location (ro)
/var/log from knative-var-log (rw)
/var/run/secrets/kubernetes.io/serviceaccount from default-token-hw2rw (ro)
queue-proxy:
Container ID:
Image: gcr.io/knative-releases/knative.dev/serving/cmd/queue@sha256:0db974f58b48b219ab8047e11b481c2bbda52b7a2d54db5ed58e8659748ec125
Image ID:
Ports: 8022/TCP, 9090/TCP, 9091/TCP, 8012/TCP
Host Ports: 0/TCP, 0/TCP, 0/TCP, 0/TCP
State: Waiting
Reason: PodInitializing
Ready: False
Restart Count: 0
Requests:
cpu: 25m
Readiness: exec [/ko-app/queue -probe-period 0] delay=0s timeout=10s period=10s #success=1 #failure=3
Environment:
SERVING_NAMESPACE: kfserving-test
SERVING_SERVICE: sklearn-iris-predictor-default
SERVING_CONFIGURATION: sklearn-iris-predictor-default
SERVING_REVISION: sklearn-iris-predictor-default-6xcgj
QUEUE_SERVING_PORT: 8012
CONTAINER_CONCURRENCY: 0
REVISION_TIMEOUT_SECONDS: 300
SERVING_POD: sklearn-iris-predictor-default-6xcgj-deployment-7c97895d96vqbgr (v1:metadata.name)
SERVING_POD_IP: (v1:status.podIP)
SERVING_LOGGING_CONFIG: {
"level": "info",
"development": false,
"outputPaths": ["stdout"],
"errorOutputPaths": ["stderr"],
"encoding": "json",
"encoderConfig": {
"timeKey": "ts",
"levelKey": "level",
"nameKey": "logger",
"callerKey": "caller",
"messageKey": "msg",
"stacktraceKey": "stacktrace",
"lineEnding": "",
"levelEncoder": "",
"timeEncoder": "iso8601",
"durationEncoder": "",
"callerEncoder": ""
}
}
SERVING_LOGGING_LEVEL:
SERVING_REQUEST_LOG_TEMPLATE: {"httpRequest": {"requestMethod": "{{.Request.Method}}", "requestUrl": "{{js .Request.RequestURI}}", "requestSize": "{{.Request.ContentLength}}", "status": {{.Response.Code}}, "responseSize": "{{.Response.Size}}", "userAgent": "{{js .Request.UserAgent}}", "remoteIp": "{{js .Request.RemoteAddr}}", "serverIp": "{{.Revision.PodIP}}", "referer": "{{js .Request.Referer}}", "latency": "{{.Response.Latency}}s", "protocol": "{{.Request.Proto}}"}, "traceId": "{{index .Request.Header "X-B3-Traceid"}}"}
SERVING_ENABLE_REQUEST_LOG: false
SERVING_REQUEST_METRICS_BACKEND: prometheus
TRACING_CONFIG_BACKEND: none
TRACING_CONFIG_ZIPKIN_ENDPOINT:
TRACING_CONFIG_STACKDRIVER_PROJECT_ID:
TRACING_CONFIG_DEBUG: false
TRACING_CONFIG_SAMPLE_RATE: 0.1
USER_PORT: 8080
SYSTEM_NAMESPACE: knative-serving
METRICS_DOMAIN: knative.dev/internal/serving
SERVING_READINESS_PROBE: {"tcpSocket":{"port":8080,"host":"127.0.0.1"},"successThreshold":1}
ENABLE_PROFILING: false
SERVING_ENABLE_PROBE_REQUEST_LOG: false
METRICS_COLLECTOR_ADDRESS:
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from default-token-hw2rw (ro)
Conditions:
Type Status
Initialized False
Ready False
ContainersReady False
PodScheduled True
Volumes:
knative-var-log:
Type: EmptyDir (a temporary directory that shares a pod's lifetime)
Medium:
SizeLimit: <unset>
default-token-hw2rw:
Type: Secret (a volume populated by a Secret)
SecretName: default-token-hw2rw
Optional: false
kfserving-provision-location:
Type: EmptyDir (a temporary directory that shares a pod's lifetime)
Medium:
SizeLimit: <unset>
QoS Class: Burstable
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute for 300s
node.kubernetes.io/unreachable:NoExecute for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 7m50s default-scheduler Successfully assigned kfserving-test/sklearn-iris-predictor-default-6xcgj-deployment-7c97895d96vqbgr to minikube
Warning BackOff 36s kubelet, minikube Back-off restarting failed container
Normal Pulled 24s (x3 over 7m47s) kubelet, minikube Container image "gcr.io/kfserving/storage-initializer:v0.5.0-rc2" already present on machine
Normal Created 23s (x3 over 7m47s) kubelet, minikube Created container storage-initializer
Normal Started 23s (x3 over 7m46s) kubelet, minikube Started container storage-initializer
I don't think it is possible to do this via the kubectl set env command.
When I tried running the command on my local setup, passing the init-container name with -c, it returned this message:
warning: Deployment/dummy does not have any containers matching "dummy-init"
Command used:
kubectl set env -n dummy-ns deploy/dummy -c "dummy-init" dummy_env="true"
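A quick way to see which container names the command can actually match is to list them from the deployment template (same dummy names as above; if the template declares no init containers at all, the warning above is the expected result):
# container names kubectl set env can target on this deployment
kubectl get -n dummy-ns deploy/dummy -o jsonpath='{.spec.template.spec.initContainers[*].name}'
kubectl get -n dummy-ns deploy/dummy -o jsonpath='{.spec.template.spec.containers[*].name}'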
You can, however, use the kubectl edit command, which opens the full YAML in edit mode; add the required environment variable to whichever container needs it and save the spec. This will create a new pod with the new spec.
kubectl edit -n dummy-ns deploy/dummy -o yaml
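For reference, the addition inside the editor would look roughly like this sketch (the container name is taken from the pod description in the question; the proxy values are placeholders). Note that the deployment YAML posted above has no initContainers section in its pod template, which suggests the storage-initializer is injected when the pod is created, so the same env block may instead have to be applied wherever that injection is configured:
      initContainers:
      - name: storage-initializer
        env:
        - name: http_proxy
          value: "{http_proxy value}"   # placeholder
        - name: https_proxy
          value: "{https_proxy value}"  # placeholder
        - name: no_proxy
          value: "{no_proxy value}"     # placeholder
If the proxy cannot be made to reach the init-container at all, another way around the failing download is to skip the external fetch entirely and point storageUri at a PersistentVolumeClaim that already holds the model files. This is only a sketch: it assumes the KFServing version in use supports pvc:// URIs and that a claim named model-pvc exists with the model already copied into it:
apiVersion: "serving.kubeflow.org/v1alpha2"
kind: "InferenceService"
metadata:
  name: "sklearn-iris"
spec:
  default:
    predictor:
      minReplicas: 1
      sklearn:
        storageUri: "pvc://model-pvc/models/sklearn/iris"  # model-pvc is an assumed, pre-populated PVC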