Zalenium sometimes missing desiredContainers - Kubernetes

My Zalenium is deployed in Kubernetes. I have set the option desiredContainers = 2 and it works, but sometimes the desired containers are not available. Tests still run properly even when the desired containers are missing. After a "restart" the containers appear again, but I have no idea why they sometimes disappear. Does anyone have an idea what's going on? Here is the Deployment manifest:
kind: Deployment
apiVersion: extensions/v1beta1
metadata:
name: zalenium
namespace: zalenium-omdc
selfLink: /apis/extensions/v1beta1/namespaces/zalenium-omdc/deployments/zalenium
uid: cbafe254-3e28-4889-a09e-ccfa500ff628
resourceVersion: '25201258'
generation: 24
creationTimestamp: '2019-09-17T13:24:52Z'
labels:
app: zalenium
instance: zalenium
annotations:
deployment.kubernetes.io/revision: '24'
kubectl.kubernetes.io/last-applied-configuration: >
{"apiVersion":"apps/v1","kind":"Deployment","metadata":{"annotations":{},"labels":{"app":"zalenium","instance":"zalenium"},"name":"zalenium","namespace":"zalenium-omdc"},"spec":{"replicas":1,"selector":{"matchLabels":{"instance":"zalenium"}},"template":{"metadata":{"labels":{"app":"zalenium","instance":"zalenium"}},"spec":{"containers":[{"args":["start"],"env":[{"name":"ZALENIUM_KUBERNETES_CPU_REQUEST","value":"250m"},{"name":"ZALENIUM_KUBERNETES_CPU_LIMIT","value":"1000m"},{"name":"ZALENIUM_KUBERNETES_MEMORY_REQUEST","value":"500Mi"},{"name":"ZALENIUM_KUBERNETES_MEMORY_LIMIT","value":"2Gi"},{"name":"DESIRED_CONTAINERS","value":"2"},{"name":"MAX_DOCKER_SELENIUM_CONTAINERS","value":"16"},{"name":"SELENIUM_IMAGE_NAME","value":"elgalu/selenium"},{"name":"VIDEO_RECORDING_ENABLED","value":"true"},{"name":"SCREEN_WIDTH","value":"1440"},{"name":"SCREEN_HEIGHT","value":"900"},{"name":"MAX_TEST_SESSIONS","value":"1"},{"name":"NEW_SESSION_WAIT_TIMEOUT","value":"1800000"},{"name":"DEBUG_ENABLED","value":"false"},{"name":"SEND_ANONYMOUS_USAGE_INFO","value":"true"},{"name":"TZ","value":"UTC"},{"name":"KEEP_ONLY_FAILED_TESTS","value":"false"},{"name":"RETENTION_PERIOD","value":"3"}],"image":"dosel/zalenium:3","imagePullPolicy":"IfNotPresent","livenessProbe":{"httpGet":{"path":"/status","port":4444},"initialDelaySeconds":90,"periodSeconds":5,"timeoutSeconds":1},"name":"zalenium","ports":[{"containerPort":4444,"protocol":"TCP"}],"readinessProbe":{"httpGet":{"path":"/status","port":4444},"timeoutSeconds":1},"resources":{"requests":{"cpu":"500m","memory":"500Mi"}},"volumeMounts":[{"mountPath":"/home/seluser/videos","name":"zalenium-videos"},{"mountPath":"/tmp/mounted","name":"zalenium-data"}]}],"serviceAccountName":"zalenium","volumes":[{"emptyDir":{},"name":"zalenium-videos"},{"emptyDir":{},"name":"zalenium-data"}]}}}}
spec:
replicas: 1
selector:
matchLabels:
instance: zalenium
template:
metadata:
creationTimestamp: null
labels:
app: zalenium
instance: zalenium
spec:
volumes:
- name: zalenium-videos
emptyDir: {}
- name: zalenium-data
emptyDir: {}
containers:
- name: zalenium
image: 'dosel/zalenium:3'
args:
- start
ports:
- containerPort: 4444
protocol: TCP
env:
- name: ZALENIUM_KUBERNETES_CPU_REQUEST
value: 250m
- name: ZALENIUM_KUBERNETES_CPU_LIMIT
value: 1000m
- name: ZALENIUM_KUBERNETES_MEMORY_REQUEST
value: 500Mi
- name: ZALENIUM_KUBERNETES_MEMORY_LIMIT
value: 2Gi
- name: DESIRED_CONTAINERS
value: '2'
- name: MAX_DOCKER_SELENIUM_CONTAINERS
value: '16'
- name: SELENIUM_IMAGE_NAME
value: elgalu/selenium
- name: VIDEO_RECORDING_ENABLED
value: 'false'
- name: SCREEN_WIDTH
value: '1920'
- name: SCREEN_HEIGHT
value: '1080'
- name: MAX_TEST_SESSIONS
value: '1'
- name: NEW_SESSION_WAIT_TIMEOUT
value: '7200000'
- name: DEBUG_ENABLED
value: 'false'
- name: SEND_ANONYMOUS_USAGE_INFO
value: 'true'
- name: TZ
value: UTC
- name: KEEP_ONLY_FAILED_TESTS
value: 'false'
- name: RETENTION_PERIOD
value: '3'
- name: SEL_BROWSER_TIMEOUT_SECS
value: '7200'
- name: BROWSER_STACK_WAIT_TIMEOUT
value: 120m
resources:
limits:
memory: 1Gi
requests:
cpu: 500m
memory: 500Mi
volumeMounts:
- name: zalenium-videos
mountPath: /home/seluser/videos
- name: zalenium-data
mountPath: /tmp/mounted
livenessProbe:
httpGet:
path: /status
port: 4444
scheme: HTTP
initialDelaySeconds: 90
timeoutSeconds: 1
periodSeconds: 5
successThreshold: 1
failureThreshold: 3
readinessProbe:
httpGet:
path: /status
port: 4444
scheme: HTTP
timeoutSeconds: 1
periodSeconds: 10
successThreshold: 1
failureThreshold: 3
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
imagePullPolicy: IfNotPresent
restartPolicy: Always
terminationGracePeriodSeconds: 30
dnsPolicy: ClusterFirst
nodeSelector:
dedicated: omdc
serviceAccountName: zalenium
serviceAccount: zalenium
securityContext: {}
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: dedicated
operator: In
values:
- omdc
schedulerName: default-scheduler
tolerations:
- key: dedicated
operator: Equal
value: omdc
effect: NoSchedule
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 25%
maxSurge: 25%
revisionHistoryLimit: 10
progressDeadlineSeconds: 600
status:
observedGeneration: 24
replicas: 1
updatedReplicas: 1
readyReplicas: 1
availableReplicas: 1
conditions:
- type: Available
status: 'True'
lastUpdateTime: '2019-10-22T06:57:52Z'
lastTransitionTime: '2019-10-22T06:57:52Z'
reason: MinimumReplicasAvailable
message: Deployment has minimum availability.
- type: Progressing
status: 'True'
lastUpdateTime: '2019-10-31T09:14:01Z'
lastTransitionTime: '2019-09-17T13:24:52Z'
reason: NewReplicaSetAvailable
message: ReplicaSet "zalenium-6df85c7f49" has successfully progressed.
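For reference, a few kubectl commands that could help show whether the idle selenium pods are being evicted or cleaned up; this is only a diagnostic sketch and assumes the browser pods run in the same zalenium-omdc namespace as the hub:
# Watch the selenium pods Zalenium spawns and note the exact time one disappears
kubectl get pods -n zalenium-omdc -o wide --watch
# Recent namespace events (evictions, OOM kills, failed scheduling), newest last
kubectl get events -n zalenium-omdc --sort-by=.lastTimestamp
# Hub log around that time, to see whether Zalenium itself tore the container down
kubectl logs deploy/zalenium -n zalenium-omdc --tail=200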

Related

Grafana alert provisioning issue

I'd like to be able to provision alerts, and tried to follow these instructions, but no luck!
Grafana OSS version 9.2.0, running in Kubernetes.
Steps I took:
1. Created a new alert rule from the UI.
2. Extracted the alert rule from the API:
curl -k https://<my-grafana-url>/api/v1/provisioning/alert-rules/-4pMuQFVk -u admin:<my-admin-password>
It returns the following:
---
id: 18
uid: "-4pMuQFVk"
orgID: 1
folderUID: 3i72aQKVk
ruleGroup: cpu_alert_group
title: my_cpu_alert
condition: B
data:
- refId: A
queryType: ''
relativeTimeRange:
from: 600
to: 0
datasourceUid: _SaubQF4k
model:
editorMode: code
expr: system_cpu_usage
hide: false
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: true
refId: A
- refId: B
queryType: ''
relativeTimeRange:
from: 0
to: 0
datasourceUid: "-100"
model:
conditions:
- evaluator:
params:
- 3
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: "-100"
expression: A
hide: false
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: classic_conditions
updated: '2022-12-07T20:01:47Z'
noDataState: NoData
execErrState: Error
for: 5m
3. Deleted the alert rule from the UI.
4. Made a ConfigMap from the above alert rule, as such:
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-alerting
data:
alerting.yaml: |-
apiVersion: 1
groups:
- id: 18
uid: "-4pMuQFVk"
orgID: 1
folderUID: 3i72aQKVk
ruleGroup: cpu_alert_group
title: my_cpu_alert
condition: B
data:
- refId: A
queryType: ''
relativeTimeRange:
from: 600
to: 0
datasourceUid: _SaubQF4k
model:
editorMode: code
expr: system_cpu_usage
hide: false
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: true
refId: A
- refId: B
queryType: ''
relativeTimeRange:
from: 0
to: 0
datasourceUid: "-100"
model:
conditions:
- evaluator:
params:
- 3
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: "-100"
expression: A
hide: false
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: classic_conditions
updated: '2022-12-07T20:01:47Z'
noDataState: NoData
execErrState: Error
for: 5m
5. Mounted the above ConfigMap into the Grafana container (at /etc/grafana/provisioning/alerting).
The full manifest of the deployment is as follows:
apiVersion: apps/v1
kind: Deployment
metadata:
annotations:
deployment.kubernetes.io/revision: "2"
meta.helm.sh/release-name: grafana
meta.helm.sh/release-namespace: monitoring
creationTimestamp: "2022-12-08T18:31:30Z"
generation: 4
labels:
app.kubernetes.io/instance: grafana
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: grafana
app.kubernetes.io/version: 9.3.0
helm.sh/chart: grafana-6.46.1
name: grafana
namespace: monitoring
resourceVersion: "648617"
uid: dc06b802-5281-4f31-a2b2-fef3cf53a70b
spec:
progressDeadlineSeconds: 600
replicas: 1
revisionHistoryLimit: 10
selector:
matchLabels:
app.kubernetes.io/instance: grafana
app.kubernetes.io/name: grafana
strategy:
rollingUpdate:
maxSurge: 25%
maxUnavailable: 25%
type: RollingUpdate
template:
metadata:
annotations:
checksum/config: 98cac51656714db48a116d3109994ee48c401b138bc8459540e1a497f994d197
checksum/dashboards-json-config: 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
checksum/sc-dashboard-provider-config: 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
checksum/secret: 203f0e4d883461bdd41fe68515fc47f679722dc2fdda49b584209d1d288a5f07
creationTimestamp: null
labels:
app.kubernetes.io/instance: grafana
app.kubernetes.io/name: grafana
spec:
automountServiceAccountToken: true
containers:
- env:
- name: GF_SECURITY_ADMIN_USER
valueFrom:
secretKeyRef:
key: admin-user
name: grafana
- name: GF_SECURITY_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
key: admin-password
name: grafana
- name: GF_PATHS_DATA
value: /var/lib/grafana/
- name: GF_PATHS_LOGS
value: /var/log/grafana
- name: GF_PATHS_PLUGINS
value: /var/lib/grafana/plugins
- name: GF_PATHS_PROVISIONING
value: /etc/grafana/provisioning
image: grafana/grafana:9.3.0
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 10
httpGet:
path: /api/health
port: 3000
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 30
name: grafana
ports:
- containerPort: 3000
name: grafana
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /api/health
port: 3000
scheme: HTTP
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
resources: {}
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /etc/grafana/grafana.ini
name: config
subPath: grafana.ini
- mountPath: /etc/grafana/provisioning/alerting
name: grafana-alerting
- mountPath: /var/lib/grafana
name: storage
dnsPolicy: ClusterFirst
enableServiceLinks: true
restartPolicy: Always
schedulerName: default-scheduler
securityContext:
fsGroup: 472
runAsGroup: 472
runAsUser: 472
serviceAccount: grafana
serviceAccountName: grafana
terminationGracePeriodSeconds: 30
volumes:
- configMap:
defaultMode: 420
name: grafana-alerting
name: grafana-alerting
- configMap:
defaultMode: 420
name: grafana
name: config
- emptyDir: {}
name: storage
However, Grafana fails to start with this error:
Failed to start grafana. error: failure to map file alerting.yaml: failure parsing rules: rule group has no name set
I fixed the above error by adding a group name, but similar errors about other missing elements kept showing up again and again, to the point where I stopped chasing them because I couldn't figure out what the correct schema actually is. Digging in, it looks like the format/schema returned by the API in step 2 is different from the schema described in the documentation.
Why is the schema of the alert rule returned from the API different? Am I supposed to convert it, and if so, how? Otherwise, what am I doing wrong?
Edit: Replaced the StatefulSet with a Deployment, since I was able to reproduce this in a normal/minimal Grafana deployment too.
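For comparison, the file-based provisioning format groups rules differently from the API payload above: each group carries a name, a folder title and an evaluation interval, with the rules nested underneath. The sketch below follows my reading of the field names in the Grafana alerting provisioning docs, with placeholder values copied from the question rather than a verified working file:
cat <<'EOF' > alerting.yaml
apiVersion: 1
groups:
  - orgId: 1
    name: cpu_alert_group   # the group name the "rule group has no name set" error refers to
    folder: my_folder       # folder title (not folderUID); assumed name
    interval: 1m
    rules:
      - uid: "-4pMuQFVk"
        title: my_cpu_alert
        condition: B
        data: []            # the same "data" blocks returned by the API can be pasted here
        noDataState: NoData
        execErrState: Error
        for: 5m
EOF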

How to deploy an Apache NiFi StatefulSet in cluster mode with an external ZooKeeper?

I tried to deploy Apache NiFi (StatefulSet) in Kubernetes, in cluster mode. First I am trying with only one node, but I don't know where I am going wrong in the YAML, or how to access the NiFi UI once the StatefulSet is deployed. I use an external ZooKeeper.
I'm not sure whether I have to create a service for each node of the cluster.
On the K8s dashboard the NiFi pod is running fine.
I know that StatefulSets allow the creation of stable network identities for pods through what is known as a headless service. But how can I access the UI next? (A possible service setup is sketched after the manifests below.)
NiFi StatefulSet YAML file:
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: nifi
labels:
name: nifi
app: nifi
annotations:
app.kubernetes.io/name: nifi
app.kubernetes.io/part-of: nifi
spec:
serviceName: nifi
# replicas: 2
revisionHistoryLimit: 1
# strategy:
# type: Recreate
selector:
matchLabels:
app: nifi
template:
metadata:
labels:
app: nifi
spec:
automountServiceAccountToken: false
enableServiceLinks: false
restartPolicy: Always
securityContext:
runAsGroup: 1000
runAsUser: 1000
runAsNonRoot: true
seccompProfile:
type: RuntimeDefault
containers:
- name: nifi
image: XXX
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8080
name: nifi
- containerPort: 8082
name: cluster
env:
- name: "NIFI_SENSITIVE_PROPS_KEY"
value: "nificlusterbulot"
- name: NIFI_WEB_HTTP_HOST
value: "nifi-0.NAMESPACE_NAME.svc.cluster.local"
- name: NIFI_WEB_HTTP_PORT
value: "8080"
- name: NIFI_ANALYTICS_PREDICT_ENABLED
value: "true"
- name: NIFI_ELECTION_MAX_CANDIDATES
value: "2"
- name: NIFI_ELECTION_MAX_WAIT
value: "1 min"
- name: NIFI_CLUSTER_IS_NODE
value: "true"
- name: NIFI_JVM_HEAP_INIT
value: "3g"
- name: NIFI_JVM_HEAP_MAX
value: "4g"
- name: NIFI_CLUSTER_NODE_CONNECTION_TIMEOUT
value: "2 min"
- name: NIFI_CLUSTER_PROTOCOL_CONNECTION_HANDSHAKE_TIMEOUT
value: "2 min"
- name: NIFI_CLUSTER_NODE_PROTOCOL_MAX_THREADS
value: "15"
- name: NIFI_CLUSTER_NODE_PROTOCOL_PORT
value: "8082"
- name: NIFI_CLUSTER_NODE_READ_TIMEOUT
value: "15"
- name: NIFI_ZK_CONNECT_STRING
value: "zookeeper:2181"
- name: NIFI_CLUSTER_NODE_ADDRESS
value: "nifi-0.nifi.NAMESPACE_NAME.cluster.local"
# valueFrom:
# fieldRef:
# fieldPath: status.podIP
# - name: HOSTNAME
# valueFrom:
# fieldRef:
# fieldPath: status.podIP
livenessProbe:
exec:
command:
- pgrep
- java
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 10
failureThreshold: 3
successThreshold: 1
readinessProbe:
exec:
command:
- pgrep
- java
initialDelaySeconds: 180
periodSeconds: 30
timeoutSeconds: 10
failureThreshold: 3
successThreshold: 1
resources:
requests:
cpu: 400m
memory: 1Gi
limits:
cpu: 500m
memory: 2Gi
volumes:
- name: pv-01
persistentVolumeClaim:
claimName: pv-claim
ZooKeeper YAML file:
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: zookeeper
namespace: namespace_name
labels:
name : zookeeper
app : zookeeper
# annotations:
# app.kubernetes.io/name: zookeeper
# app.kubernetes.io/part-of: nifi
spec:
revisionHistoryLimit: 1
serviceName: zookeeper
selector:
matchLabels:
app: zookeeper
template:
metadata:
labels:
app: zookeeper
spec:
automountServiceAccountToken: false
enableServiceLinks: false
restartPolicy: Always
securityContext:
runAsGroup: 1000
runAsUser: 1000
runAsNonRoot: true
seccompProfile:
type: RuntimeDefault
containers:
- name: zookeeper
image: XXX
imagePullPolicy: IfNotPresent
ports:
- containerPort: 2181
name: zk
- containerPort: 2182
name: zc
# - containerPort: 8083
# name: web
- containerPort: 5111
name: cmd
env:
- name: ALLOW_ANONYMOUS_LOGIN
value: "yes"
- name: ZOO_ADMINSERVER_ENABLED
value: "true"
- name: ZOO_AUTOPURGE_PURGEINTERVAL
value: "2"
- name: ZOO_AUTOPURGE_SNAPRETAINCOUNT
value: "10"
- name: ZOO_INIT_LIMIT
value: "10"
- name: ZOO_STANDALONE_ENABLED
value: "true"
- name: ZOO_SYNC_LIMIT
value: "6"
- name: ZOO_TICK_TIME
value: "4000"
livenessProbe:
exec:
command:
- which
- java
initialDelaySeconds: 30
periodSeconds: 30
timeoutSeconds: 10
failureThreshold: 3
successThreshold: 1
readinessProbe:
tcpSocket:
port: 2181
initialDelaySeconds: 120
periodSeconds: 30
timeoutSeconds: 10
failureThreshold: 3
successThreshold: 1
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 300m
memory: 2Gi
securityContext:
allowPrivilegeEscalation: false
privileged: false
runAsGroup: 1000
runAsUser: 1000
volumes:
- name: pv-01
persistentVolumeClaim:
claimName: pv-claim
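One possible way to reach the UI, sketched here as an assumption rather than a verified setup (the nifi name, the app: nifi label and the 8080 port come from the StatefulSet above; add your namespace as needed):
# Headless service backing serviceName: nifi, so nifi-0 gets a stable DNS name
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Service
metadata:
  name: nifi
spec:
  clusterIP: None
  selector:
    app: nifi
  ports:
    - name: nifi
      port: 8080
    - name: cluster
      port: 8082
EOF
# Then, for ad-hoc access from a workstation, forward the UI port of one node
kubectl port-forward nifi-0 8080:8080   # open http://localhost:8080/nifi
If the UI has to be reachable from outside the cluster, a second, non-headless Service (ClusterIP behind an Ingress, or NodePort/LoadBalancer) selecting the same app: nifi label would be needed; the headless service itself only provides the per-pod DNS names.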

AKS - Pods created by HPA trigger are getting terminated immediately after they are created

When we looked into the events in AKS, we observed the error below for all the pods that were created and then terminated:
2m47s Warning FailedMount pod/app-fd6c6b8d9-ssr2t Unable to attach or mount volumes: unmounted volumes=[log-volume config-volume log4j2 secrets-app-inline kube-api-access-z49xc], unattached volumes=[log-volume config-volume log4j2 secrets-app-inline kube-api-access-z49xc]: timed out waiting for the condition
We already have 2 replicas of the application running, so we don't think the error is due to the access modes of the volumes.
Below is the HPA config:
apiVersion: autoscaling/v2beta1
kind: HorizontalPodAutoscaler
metadata:
name: app-cpu-hpa
namespace: namespace-dev
spec:
maxReplicas: 5
minReplicas: 2
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: app
metrics:
- type: Resource
resource:
name: cpu
targetAverageValue: 500m
Below is the deployment config:
apiVersion: apps/v1
kind: Deployment
metadata:
name: app
labels:
app: app
group: app
obs: appd
spec:
replicas: 2
selector:
matchLabels:
app: app
template:
metadata:
annotations:
container.apparmor.security.beta.kubernetes.io/app: runtime/default
labels:
app: app
group: app
obs: appd
spec:
containers:
- name: app
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 2000
imagePullPolicy: {{ .Values.image.pullPolicy }}
resources:
limits:
cpu: {{ .Values.app.limits.cpu }}
memory: {{ .Values.app.limits.memory }}
requests:
cpu: {{ .Values.app.requests.cpu }}
memory: {{ .Values.app.requests.memory }}
env:
- name: LOG_DIR_PATH
value: /opt/apps/
volumeMounts:
- name: log-volume
mountPath: /opt/apps/app/logs
- name: config-volume
mountPath: /script/start.sh
subPath: start.sh
- name: log4j2
mountPath: /opt/appdynamics-java/ver21.9.0.33073/conf/logging/log4j2.xml
subPath: log4j2.xml
- name: secrets-app-inline
mountPath: "/mnt/secrets-app"
readOnly: true
readinessProbe:
failureThreshold: 3
httpGet:
path: /actuator/info
port: {{ .Values.metrics.port }}
scheme: "HTTP"
httpHeaders:
- name: Authorization
value: "Basic XXX50aXXXXXX=="
- name: cache-control
value: "no-cache"
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
initialDelaySeconds: 60
livenessProbe:
httpGet:
path: /actuator/info
port: {{ .Values.metrics.port }}
scheme: "HTTP"
httpHeaders:
- name: Authorization
value: "Basic XXX50aXXXXXX=="
- name: cache-control
value: "no-cache"
initialDelaySeconds: 300
periodSeconds: 5
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
volumes:
- name: log-volume
persistentVolumeClaim:
claimName: {{ .Values.apppvc.name }}
- name: config-volume
configMap:
name: {{ .Values.configmap.name }}-configmap
defaultMode: 0755
- name: secrets-app-inline
csi:
driver: secrets-store.csi.k8s.io
readOnly: true
volumeAttributes:
secretProviderClass: "app-kv-secret"
nodePublishSecretRef:
name: secrets-app-creds
- name: log4j2
configMap:
name: log4j2
defaultMode: 0755
restartPolicy: Always
imagePullSecrets:
- name: {{ .Values.imagePullSecrets }}
Can someone please let me know where the config might be going wrong?
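A few checks that might narrow down which of the listed volumes is timing out; the pod name and namespace below are taken from the event and HPA in the question, and the CSI driver label is an assumption that may differ per install:
# Read the mount/attach events of the stuck pod to see which volume never shows up
kubectl describe pod app-fd6c6b8d9-ssr2t -n namespace-dev
# If it is the log-volume PVC: check its access mode. A ReadWriteOnce disk can only be
# attached to one node, so a scaled-up pod landing on a different node than the existing
# replicas would hit exactly this timeout.
kubectl get pvc -n namespace-dev
# If it is the secrets-store CSI volume: check the driver pods (label is an assumption)
kubectl logs -n kube-system -l app=secrets-store-csi-driver --all-containers --tail=50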

cp: cannot stat '/opt/flink/opt/flink-metrics-prometheus-*.jar': No such file or directory in apache flink

I am upgrading Apache Flink 1.10 to Apache Flink 1.11 in Kubernetes, but the jobmanager pod log shows:
cp: cannot stat '/opt/flink/opt/flink-metrics-prometheus-*.jar': No such file or directory
This is my jobmanager Deployment YAML:
kind: Deployment
apiVersion: apps/v1
metadata:
name: report-flink-jobmanager
namespace: middleware
selfLink: /apis/apps/v1/namespaces/middleware/deployments/report-flink-jobmanager
uid: b7bd8f0d-cddb-44e7-8bbe-b96e68dbfbcd
resourceVersion: '13655071'
generation: 44
creationTimestamp: '2020-06-08T02:11:33Z'
labels:
app.kubernetes.io/instance: report-flink
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: flink
app.kubernetes.io/version: 1.10.0
component: jobmanager
helm.sh/chart: flink-0.1.15
annotations:
deployment.kubernetes.io/revision: '6'
meta.helm.sh/release-name: report-flink
meta.helm.sh/release-namespace: middleware
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/instance: report-flink
app.kubernetes.io/name: flink
component: jobmanager
template:
metadata:
creationTimestamp: null
labels:
app.kubernetes.io/instance: report-flink
app.kubernetes.io/name: flink
component: jobmanager
spec:
volumes:
- name: flink-config-volume
configMap:
name: report-flink-config
items:
- key: flink-conf.yaml
path: flink-conf.yaml.tpl
- key: log4j.properties
path: log4j.properties
- key: security.properties
path: security.properties
defaultMode: 420
- name: flink-pro-persistent-storage
persistentVolumeClaim:
claimName: flink-pv-claim
containers:
- name: jobmanager
image: 'flink:1.11'
command:
- /bin/bash
- '-c'
- >-
cp /opt/flink/opt/flink-metrics-prometheus-*.jar
/opt/flink/opt/flink-s3-fs-presto-*.jar /opt/flink/lib/ && wget
https://repo1.maven.org/maven2/com/github/oshi/oshi-core/3.4.0/oshi-core-3.4.0.jar
-O /opt/flink/lib/oshi-core-3.4.0.jar && wget
https://repo1.maven.org/maven2/net/java/dev/jna/jna/5.4.0/jna-5.4.0.jar
-O /opt/flink/lib/jna-5.4.0.jar && wget
https://repo1.maven.org/maven2/net/java/dev/jna/jna-platform/5.4.0/jna-platform-5.4.0.jar
-O /opt/flink/lib/jna-platform-5.4.0.jar && cp
$FLINK_HOME/conf/flink-conf.yaml.tpl
$FLINK_HOME/conf/flink-conf.yaml && $FLINK_HOME/bin/jobmanager.sh
start; while :; do if [[ -f $(find log -name '*jobmanager*.log'
-print -quit) ]]; then tail -f -n +1 log/*jobmanager*.log; fi;
done
workingDir: /opt/flink
ports:
- name: blob
containerPort: 6124
protocol: TCP
- name: rpc
containerPort: 6123
protocol: TCP
- name: ui
containerPort: 8081
protocol: TCP
- name: metrics
containerPort: 9999
protocol: TCP
env:
- name: JVM_ARGS
value: '-Djava.security.properties=/opt/flink/conf/security.properties'
- name: FLINK_POD_IP
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.podIP
- name: APOLLO_META
valueFrom:
configMapKeyRef:
name: pro-config
key: apollo.meta
- name: ENV
valueFrom:
configMapKeyRef:
name: pro-config
key: env
resources: {}
volumeMounts:
- name: flink-config-volume
mountPath: /opt/flink/conf/flink-conf.yaml.tpl
subPath: flink-conf.yaml.tpl
- name: flink-config-volume
mountPath: /opt/flink/conf/log4j.properties
subPath: log4j.properties
- name: flink-config-volume
mountPath: /opt/flink/conf/security.properties
subPath: security.properties
- name: flink-pro-persistent-storage
mountPath: /opt/flink/data/
livenessProbe:
tcpSocket:
port: 6124
initialDelaySeconds: 10
timeoutSeconds: 1
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
readinessProbe:
tcpSocket:
port: 6123
initialDelaySeconds: 20
timeoutSeconds: 1
periodSeconds: 10
successThreshold: 1
failureThreshold: 3
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
imagePullPolicy: IfNotPresent
restartPolicy: Always
terminationGracePeriodSeconds: 30
dnsPolicy: ClusterFirst
serviceAccountName: jobmanager
serviceAccount: jobmanager
securityContext: {}
schedulerName: default-scheduler
strategy:
type: Recreate
revisionHistoryLimit: 10
progressDeadlineSeconds: 600
status:
observedGeneration: 44
replicas: 1
updatedReplicas: 1
unavailableReplicas: 1
conditions:
- type: Available
status: 'False'
lastUpdateTime: '2020-08-19T06:26:56Z'
lastTransitionTime: '2020-08-19T06:26:56Z'
reason: MinimumReplicasUnavailable
message: Deployment does not have minimum availability.
- type: Progressing
status: 'False'
lastUpdateTime: '2020-08-19T06:42:56Z'
lastTransitionTime: '2020-08-19T06:42:56Z'
reason: ProgressDeadlineExceeded
message: >-
ReplicaSet "report-flink-jobmanager-7b8b9bd6bb" has timed out
progressing.
Should I remove the reference to the jar file that does not exist? How can I fix this?
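One way to check is to start a throwaway pod from the same image and look at where the jars actually live; I believe 1.11 reworked the metric reporters to be loadable as plugins, so the jar may have moved out of /opt/flink/opt or been renamed, but that is an assumption worth verifying rather than relying on:
# Inspect the flink:1.11 image directly
kubectl run flink-111-inspect --rm -it --restart=Never --image=flink:1.11 -- \
  bash -c "ls /opt/flink/opt /opt/flink/plugins 2>/dev/null | grep -iE 'prometheus|presto' || echo 'not found'"
Depending on what that shows, the cp line in the container command can be pointed at the new location, or made tolerant of a missing jar so it does not abort the rest of the startup command.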

Update Kafka in Kubernetes causes downtime

I'm running a 4-broker Kafka cluster in Kubernetes. The replication factor is 3 and the min ISR is 2.
In addition, there is a producer service (running Spring Cloud Stream) generating messages and a consumer service reading from the topic. I tried to update the Kafka cluster with a rolling update, hoping for no downtime, but during the update the producer's log filled up with this error:
org.apache.kafka.common.errors.NotLeaderForPartitionException: This server is not the leader for that topic-partition.
According to my calculations, when 1 broker is down there shouldn't be a problem, because the min ISR is 2. However, it seems like the producer service is unaware of the rolling update and keeps sending messages to the same broker...
Any ideas on how to solve this?
This is my kafka.yaml:
apiVersion: apps/v1beta1
kind: StatefulSet
metadata:
name: kafka
namespace: default
labels:
app: kafka
spec:
serviceName: kafka
replicas: 4
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
app: kafka
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9308"
spec:
nodeSelector:
middleware.node: "true"
imagePullSecrets:
- name: nexus-registry
terminationGracePeriodSeconds: 300
containers:
- name: kafka
image: kafka:2.12-2.1.0
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: 3000m
memory: 1800Mi
requests:
cpu: 2000m
memory: 1800Mi
env:
# Replication
- name: KAFKA_DEFAULT_REPLICATION_FACTOR
value: "3"
- name: KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR
value: "3"
- name: KAFKA_MIN_INSYNC_REPLICAS
value: "2"
# Protocol Version
- name: KAFKA_INTER_BROKER_PROTOCOL_VERSION
value: "2.1"
- name: KAFKA_LOG_MESSAGE_FORMAT_VERSION
value: "2.1"
- name: ENABLE_AUTO_EXTEND
value: "true"
- name: KAFKA_DELETE_TOPIC_ENABLE
value: "true"
- name: KAFKA_RESERVED_BROKER_MAX_ID
value: "999999999"
- name: KAFKA_AUTO_CREATE_TOPICS_ENABLE
value: "true"
- name: KAFKA_PORT
value: "9092"
- name: KAFKA_ADVERTISED_PORT
value: "9092"
- name: KAFKA_NUM_RECOVERY_THREADS_PER_DATA_DIR
value: "10"
- name: KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR
value: "3"
- name: KAFKA_LOG_RETENTION_BYTES
value: "1800000000000"
- name: KAFKA_ADVERTISED_HOST_NAME
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: KAFKA_OFFSETS_RETENTION_MINUTES
value: "10080"
- name: KAFKA_ZOOKEEPER_CONNECT
valueFrom:
configMapKeyRef:
name: zk-config
key: zk.endpoints
- name: KAFKA_LOG_DIRS
value: /kafka/kafka-logs
ports:
- name: kafka
containerPort: 9092
- name: prometheus
containerPort: 7071
volumeMounts:
- name: data
mountPath: /kafka
readinessProbe:
tcpSocket:
port: 9092
timeoutSeconds: 1
failureThreshold: 12
initialDelaySeconds: 10
periodSeconds: 30
successThreshold: 1
- name: kafka-exporter
image: danielqsj/kafka-exporter:latest
resources:
requests:
cpu: 100m
memory: 100Mi
limits:
cpu: 500m
memory: 500Mi
ports:
- containerPort: 9308
volumeClaimTemplates:
- metadata:
name: data
labels:
app: kafka
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 2000Gi
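Two things that could be checked here, sketched as assumptions rather than a confirmed fix (the script path inside the container may differ for this image):
# 1. NotLeaderForPartitionException is a retriable error: confirm the producer has retries
#    enabled (retries > 0, a reasonable retry.backoff.ms) so it refreshes metadata and
#    re-sends to the new partition leader instead of surfacing the exception.
# 2. Watch partition leadership and ISR while a single broker restarts, to confirm leaders
#    actually move off the restarting broker:
kubectl exec kafka-0 -- sh -c \
  '/opt/kafka/bin/kafka-topics.sh --zookeeper "$KAFKA_ZOOKEEPER_CONNECT" --describe --under-replicated-partitions'
If leadership only moves after the broker is already gone, increasing the producer's retries/delivery timeout and confirming controlled shutdown is enabled on the brokers are the usual levers; neither is visible in the manifest above.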