DC/OS doesn't allocate resources on the second node - Marathon

I'm running a simple DC/OS cluster for testing, with one master and two private nodes. On the first day I installed Kafka and ran some tests; everything worked correctly.
However, after restarting the cluster Kafka was in a failed state. After another restart it was healthy again, but it had no brokers. After a while I discovered that this was because the second node was completely unused.
When the first node is at 100% CPU usage, the remaining tasks just wait instead of running on the second node, and they show a Waiting status in Marathon. The second node always sits at 0% CPU and 0% RAM allocation, without any change. All tasks run only on the first node.
The second node is in a healthy state and shows up correctly, and I can SSH to it from the master without any problems. Restarting doesn't help.
How can I fix this? I don't know where to look.
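For reference, this is roughly how I pull the per-agent resource view that I paste further down in #anotherEdit (a minimal Python sketch on my side; the master address is a placeholder and the /mesos/master/slaves path through the DC/OS admin router is my assumption):
# Minimal sketch: compare total vs. unreserved resources per agent.
# The master address is a placeholder; /mesos/master/slaves is assumed to be
# the admin-router path to the Mesos master state shown in #anotherEdit below.
import requests

MASTER = "http://10.132.0.10"  # placeholder: my master's address

state = requests.get(MASTER + "/mesos/master/slaves").json()
for agent in state["slaves"]:
    print(agent["hostname"],
          "total cpus:", agent["resources"]["cpus"],
          "unreserved cpus:", agent["unreserved_resources"]["cpus"],
          "reserved roles:", list(agent.get("reserved_resources", {}).keys()))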
#edit:
Kafka JSON (a small plan-check sketch follows the config):
{
"id": "/kafka",
"cmd": "export LD_LIBRARY_PATH=$MESOS_SANDBOX/libmesos-bundle/lib:$LD_LIBRARY_PATH && export MESOS_NATIVE_JAVA_LIBRARY=$(ls $MESOS_SANDBOX/libmesos-bundle/lib/libmesos-*.so) && export PATH=$(ls -d $MESOS_SANDBOX/jre*/bin):$PATH && ./scheduler/bin/kafka-scheduler server ./scheduler/conf/scheduler.yml",
"cpus": 1,
"mem": 3072,
"disk": 0,
"instances": 1,
"acceptedResourceRoles": [
"*"
],
"env": {
"KAFKA_OVERRIDE_LEADER_IMBALANCE_CHECK_INTERVAL_SECONDS": "300",
"KAFKA_OVERRIDE_OFFSET_METADATA_MAX_BYTES": "4096",
"KAFKA_OVERRIDE_PRODUCER_PURGATORY_PURGE_INTERVAL_REQUESTS": "1000",
"KAFKA_OVERRIDE_ZOOKEEPER_SESSION_TIMEOUT_MS": "6000",
"BROKER_STATSD_PORT": "0",
"KAFKA_OVERRIDE_LEADER_IMBALANCE_PER_BROKER_PERCENTAGE": "10",
"KAFKA_OVERRIDE_CONTROLLED_SHUTDOWN_MAX_RETRIES": "3",
"KAFKA_OVERRIDE_LOG_CLEANER_DEDUPE_BUFFER_SIZE": "134217728",
"LD_LIBRARY_PATH": "/opt/mesosphere/lib",
"KAFKA_OVERRIDE_CONTROLLER_SOCKET_TIMEOUT_MS": "30000",
"BROKER_JMX_REMOTE_AUTH": "false",
"KAFKA_OVERRIDE_OFFSETS_LOAD_BUFFER_SIZE": "5242880",
"BROKER_MEM": "2304",
"KAFKA_OVERRIDE_QUEUED_MAX_REQUESTS": "500",
"KAFKA_OVERRIDE_LOG_CLEANER_IO_BUFFER_LOAD_FACTOR": "0.9",
"KAFKA_OVERRIDE_OFFSETS_TOPIC_REPLICATION_FACTOR": "3",
"BROKER_COUNT": "1",
"KAFKA_OVERRIDE_QUOTA_CONSUMER_DEFAULT": "9223372036854775807",
"KAFKA_OVERRIDE_INTER_BROKER_PROTOCOL_VERSION": "0.10.0.0",
"BROKER_HEALTH_CHECK_MAX_FAILURES": "3",
"KAFKA_OVERRIDE_LOG_CLEANER_DELETE_RETENTION_MS": "86400000",
"KAFKA_OVERRIDE_OFFSETS_TOPIC_SEGMENT_BYTES": "104857600",
"BROKER_CPUS": "1",
"KAFKA_OVERRIDE_LOG_FLUSH_OFFSET_CHECKPOINT_INTERVAL_MS": "60000",
"KAFKA_OVERRIDE_OFFSETS_COMMIT_TIMEOUT_MS": "5000",
"JAVA_HOME": "./jre1.8.0_121",
"KAFKA_OVERRIDE_AUTO_CREATE_TOPICS_ENABLE": "true",
"KAFKA_OVERRIDE_LOG_CLEANER_THREADS": "1",
"JAVA_URI": "https://downloads.mesosphere.com/java/jre-8u121-linux-x64.tar.gz",
"KAFKA_OVERRIDE_DELETE_TOPIC_ENABLE": "false",
"KAFKA_OVERRIDE_MIN_INSYNC_REPLICAS": "1",
"PHASE_STRATEGY": "INSTALL",
"ENABLE_BROKER_HEALTH_CHECK": "true",
"BROKER_JMX_REMOTE_SSL": "false",
"KAFKA_OVERRIDE_QUOTA_PRODUCER_DEFAULT": "9223372036854775807",
"KAFKA_OVERRIDE_GROUP_MIN_SESSION_TIMEOUT_MS": "6000",
"KAFKA_OVERRIDE_SOCKET_RECEIVE_BUFFER_BYTES": "102400",
"KAFKA_OVERRIDE_COMPRESSION_TYPE": "producer",
"BROKER_HEALTH_CHECK_DELAY_SEC": "15",
"BROKER_JMX_REMOTE_SSL_NEED_CLIENT_AUTH": "false",
"KAFKA_ADVERTISE_HOST_IP": "true",
"KAFKA_OVERRIDE_REPLICA_SOCKET_TIMEOUT_MS": "30000",
"KAFKA_OVERRIDE_OFFSETS_RETENTION_CHECK_INTERVAL_MS": "600000",
"KAFKA_OVERRIDE_SOCKET_REQUEST_MAX_BYTES": "104857600",
"BROKER_HEALTH_CHECK_GRACE_SEC": "10",
"BROKER_PORT": "9100",
"KAFKA_OVERRIDE_OFFSETS_TOPIC_COMPRESSION_CODEC": "0",
"BROKER_JMX_ENABLE": "false",
"KAFKA_OVERRIDE_LOG_CLEANER_ENABLE": "true",
"KAFKA_OVERRIDE_DEFAULT_REPLICATION_FACTOR": "1",
"KAFKA_OVERRIDE_LOG_ROLL_JITTER_HOURS": "0",
"KAFKA_OVERRIDE_LOG_SEGMENT_DELETE_DELAY_MS": "60000",
"KAFKA_OVERRIDE_SOCKET_SEND_BUFFER_BYTES": "102400",
"KAFKA_OVERRIDE_BACKGROUND_THREADS": "10",
"KAFKA_OVERRIDE_LOG_CLEANER_IO_BUFFER_SIZE": "524288",
"KAFKA_OVERRIDE_NUM_REPLICA_FETCHERS": "1",
"BROKER_JMX_REMOTE_PORT": "9999",
"KAFKA_OVERRIDE_METRICS_NUM_SAMPLES": "2",
"OVERRIDER_URI": "https://downloads.mesosphere.com/kafka/assets/1.1.19-0.10.1.0/overrider.zip",
"BROKER_JMX_REMOTE_ENABLE": "false",
"KAFKA_OVERRIDE_AUTO_LEADER_REBALANCE_ENABLE": "true",
"KAFKA_OVERRIDE_UNCLEAN_LEADER_ELECTION_ENABLE": "true",
"KAFKA_OVERRIDE_NUM_RECOVERY_THREADS_PER_DATA_DIR": "1",
"BROKER_HEAP_MB": "2048",
"KAFKA_OVERRIDE_OFFSETS_TOPIC_NUM_PARTITIONS": "50",
"USER": "root",
"PLACEMENT_STRATEGY": "NODE",
"KAFKA_OVERRIDE_OFFSETS_RETENTION_MINUTES": "1440",
"KAFKA_OVERRIDE_LOG_FLUSH_SCHEDULER_INTERVAL_MS": "9223372036854775807",
"KAFKA_OVERRIDE_LOG_CLEANER_IO_MAX_BYTES_PER_SECOND": "1.7976931348623157E308",
"KAFKA_OVERRIDE_OFFSETS_COMMIT_REQUIRED_ACKS": "-1",
"BROKER_JMX_REMOTE_REGISTRY_SSL": "false",
"KAFKA_OVERRIDE_QUOTA_WINDOW_SIZE_SECONDS": "1",
"FRAMEWORK_PRINCIPAL": "kafka-principal",
"KAFKA_OVERRIDE_REPLICA_HIGH_WATERMARK_CHECKPOINT_INTERVAL_MS": "5000",
"KAFKA_OVERRIDE_LOG_RETENTION_HOURS": "168",
"RECOVERY_GRACE_PERIOD_SEC": "1200",
"BROKER_HEALTH_CHECK_INTERVAL_SEC": "10",
"KAFKA_OVERRIDE_CONNECTIONS_MAX_IDLE_MS": "600000",
"KAFKA_OVERRIDE_LOG_INDEX_INTERVAL_BYTES": "4096",
"KAFKA_OVERRIDE_RESERVED_BROKER_MAX_ID": "1000",
"KAFKA_OVERRIDE_LOG_CLEANER_BACKOFF_MS": "15000",
"REPLACE_DELAY_SEC": "600",
"KAFKA_OVERRIDE_MESSAGE_MAX_BYTES": "1000012",
"KAFKA_OVERRIDE_MAX_CONNECTIONS_PER_IP": "2147483647",
"KAFKA_OVERRIDE_LOG_CLEANER_MIN_CLEANABLE_RATIO": "0.5",
"KAFKA_OVERRIDE_LOG_ROLL_HOURS": "168",
"DISK_TYPE": "ROOT",
"KAFKA_OVERRIDE_LOG_INDEX_SIZE_MAX_BYTES": "10485760",
"KAFKA_OVERRIDE_QUOTA_WINDOW_NUM": "11",
"KAFKA_OVERRIDE_REPLICA_FETCH_MIN_BYTES": "1",
"KAFKA_OVERRIDE_REQUEST_TIMEOUT_MS": "30000",
"KAFKA_OVERRIDE_LOG_FLUSH_INTERVAL_MESSAGES": "9223372036854775807",
"KAFKA_OVERRIDE_LOG_RETENTION_CHECK_INTERVAL_MS": "300000",
"EXECUTOR_URI": "https://downloads.mesosphere.com/kafka/assets/1.1.19-0.10.1.0/executor.zip",
"KAFKA_OVERRIDE_ZOOKEEPER_SYNC_TIME_MS": "2000",
"KAFKA_OVERRIDE_REPLICA_LAG_TIME_MAX_MS": "10000",
"KAFKA_OVERRIDE_NUM_NETWORK_THREADS": "3",
"KAFKA_OVERRIDE_LOG_MESSAGE_FORMAT_VERSION": "0.10.0",
"FRAMEWORK_NAME": "kafka",
"KAFKA_URI": "https://downloads.mesosphere.com/kafka/assets/kafka_2.11-0.10.1.0.tgz",
"KAFKA_OVERRIDE_CONTROLLED_SHUTDOWN_RETRY_BACKOFF_MS": "5000",
"KAFKA_OVERRIDE_NUM_PARTITIONS": "1",
"BROKER_HEALTH_CHECK_TIMEOUT_SEC": "20",
"ENABLE_REPLACEMENT": "false",
"KAFKA_OVERRIDE_GROUP_MAX_SESSION_TIMEOUT_MS": "300000",
"KAFKA_OVERRIDE_REPLICA_FETCH_MAX_BYTES": "1048576",
"KAFKA_OVERRIDE_CONTROLLED_SHUTDOWN_ENABLE": "true",
"KAFKA_OVERRIDE_METRICS_SAMPLE_WINDOW_MS": "30000",
"KAFKA_OVERRIDE_NUM_IO_THREADS": "8",
"KAFKA_OVERRIDE_LOG_SEGMENT_BYTES": "1073741824",
"KAFKA_OVERRIDE_LOG_PREALLOCATE": "false",
"KAFKA_OVERRIDE_LOG_RETENTION_BYTES": "-1",
"KAFKA_OVERRIDE_REPLICA_FETCH_WAIT_MAX_MS": "500",
"KAFKA_OVERRIDE_FETCH_PURGATORY_PURGE_INTERVAL_REQUESTS": "1000",
"KAFKA_OVERRIDE_LOG_CLEANUP_POLICY": "delete",
"BROKER_DISK": "5000",
"KAFKA_ZOOKEEPER_URI": "master.mesos:2181",
"KAFKA_OVERRIDE_REPLICA_FETCH_BACKOFF_MS": "1000",
"KAFKA_OVERRIDE_REPLICA_SOCKET_RECEIVE_BUFFER_BYTES": "65536",
"KAFKA_VER_NAME": "kafka_2.11-0.10.1.0"
},
"healthChecks": [
{
"gracePeriodSeconds": 120,
"intervalSeconds": 30,
"timeoutSeconds": 5,
"maxConsecutiveFailures": 0,
"portIndex": 0,
"path": "/admin/healthcheck",
"protocol": "HTTP",
"ignoreHttp1xx": false
}
],
"labels": {
"DCOS_PACKAGE_RELEASE": "39",
"DCOS_SERVICE_SCHEME": "http",
"DCOS_PACKAGE_SOURCE": "https://universe.mesosphere.com/repo",
"DCOS_PACKAGE_COMMAND": "eyJwaXAiOlsiaHR0cHM6Ly9kb3dubG9hZHMubWVzb3NwaGVyZS5jb20va2Fma2EvYXNzZXRzLzEuMS4xOS0wLjEwLjEuMC9iaW5fd3JhcHBlci0wLjAuMS1weTIucHkzLW5vbmUtYW55LndobCJdfQ==",
"DCOS_PACKAGE_METADATA": "eyJwYWNrYWdpbmdWZXJzaW9uIjoiMy4wIiwibmFtZSI6ImthZmthIiwidmVyc2lvbiI6IjEuMS4xOS4xLTAuMTAuMS4wIiwibWFpbnRhaW5lciI6InN1cHBvcnRAbWVzb3NwaGVyZS5pbyIsImRlc2NyaXB0aW9uIjoiQXBhY2hlIEthZmthIHJ1bm5pbmcgb24gREMvT1MiLCJ0YWdzIjpbIm1lc3NhZ2UiLCJicm9rZXIiLCJwdWJzdWIiXSwic2VsZWN0ZWQiOmZhbHNlLCJmcmFtZXdvcmsiOnRydWUsInBvc3RJbnN0YWxsTm90ZXMiOiJEQy9PUyBLYWZrYSBTZXJ2aWNlIGlzIGJlaW5nIGluc3RhbGxlZC5cblxuXHREb2N1bWVudGF0aW9uOiBodHRwczovL2RvY3MubWVzb3NwaGVyZS5jb20vY3VycmVudC91c2FnZS9zZXJ2aWNlLWd1aWRlcy9rYWZrYS9cblx0SXNzdWVzOiBodHRwczovL2Rjb3NqaXJhLmF0bGFzc2lhbi5uZXQvcHJvamVjdHMvS0FGS0EvaXNzdWVzIiwicG9zdFVuaW5zdGFsbE5vdGVzIjoiREMvT1MgS2Fma2EgU2VydmljZSBoYXMgYmVlbiB1bmluc3RhbGxlZC5cblBsZWFzZSBmb2xsb3cgdGhlIGluc3RydWN0aW9ucyBhdCBodHRwczovL2RvY3MubWVzb3NwaGVyZS5jb20vY3VycmVudC91c2FnZS9zZXJ2aWNlLWd1aWRlcy9rYWZrYS91bmluc3RhbGwgdG8gcmVtb3ZlIGFueSBwZXJzaXN0ZW50IHN0YXRlIGlmIHJlcXVpcmVkLiIsImltYWdlcyI6eyJpY29uLXNtYWxsIjoiaHR0cHM6Ly9kb3dubG9hZHMubWVzb3NwaGVyZS5jb20vdW5pdmVyc2UvYXNzZXRzL2ljb24tc2VydmljZS1rYWZrYS1zbWFsbC5wbmciLCJpY29uLW1lZGl1bSI6Imh0dHBzOi8vZG93bmxvYWRzLm1lc29zcGhlcmUuY29tL3VuaXZlcnNlL2Fzc2V0cy9pY29uLXNlcnZpY2Uta2Fma2EtbWVkaXVtLnBuZyIsImljb24tbGFyZ2UiOiJodHRwczovL2Rvd25sb2Fkcy5tZXNvc3BoZXJlLmNvbS91bml2ZXJzZS9hc3NldHMvaWNvbi1zZXJ2aWNlLWthZmthLWxhcmdlLnBuZyJ9fQ==",
"DCOS_PACKAGE_REGISTRY_VERSION": "3.0",
"DCOS_SERVICE_NAME": "kafka",
"DCOS_PACKAGE_FRAMEWORK_NAME": "kafka",
"DCOS_SERVICE_PORT_INDEX": "1",
"DCOS_PACKAGE_VERSION": "1.1.19.1-0.10.1.0",
"DCOS_MIGRATION_API_PATH": "/v1/plan",
"DCOS_PACKAGE_NAME": "kafka",
"MARATHON_SINGLE_INSTANCE_APP": "true",
"DCOS_PACKAGE_IS_FRAMEWORK": "true",
"DCOS_MIGRATION_API_VERSION": "v1"
},
"portDefinitions": [
{
"port": 10001,
"protocol": "tcp",
"name": "health",
"labels": {}
},
{
"port": 10002,
"protocol": "tcp",
"name": "api",
"labels": {}
}
],
"uris": [
"https://downloads.mesosphere.com/java/jre-8u121-linux-x64.tar.gz",
"https://downloads.mesosphere.com/kafka/assets/1.1.19-0.10.1.0/scheduler.zip",
"https://downloads.mesosphere.com/kafka/assets/kafka_2.11-0.10.1.0.tgz",
"https://downloads.mesosphere.com/libmesos-bundle/libmesos-bundle-1.9-argus-1.1.x-3.tar.gz"
],
"fetch": [
{
"uri": "https://downloads.mesosphere.com/java/jre-8u121-linux-x64.tar.gz",
"extract": true,
"executable": false,
"cache": false
},
{
"uri": "https://downloads.mesosphere.com/kafka/assets/1.1.19-0.10.1.0/scheduler.zip",
"extract": true,
"executable": false,
"cache": false
},
{
"uri": "https://downloads.mesosphere.com/kafka/assets/kafka_2.11-0.10.1.0.tgz",
"extract": true,
"executable": false,
"cache": false
},
{
"uri": "https://downloads.mesosphere.com/libmesos-bundle/libmesos-bundle-1.9-argus-1.1.x-3.tar.gz",
"extract": true,
"executable": false,
"cache": false
}
],
"readinessChecks": [
{
"name": "kafkaUpdateProgress",
"protocol": "HTTP",
"path": "/v1/plan",
"portName": "api",
"intervalSeconds": 30,
"timeoutSeconds": 10,
"httpStatusCodesForReady": [
200
],
"preserveLastResponse": true
}
],
"upgradeStrategy": {
"minimumHealthCapacity": 0,
"maximumOverCapacity": 0
}
}
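Since the brokers never came back, I also look at the scheduler's deploy plan. The /v1/plan path is the one from the readinessCheck above; the /service/kafka proxy prefix through the admin router is my assumption, so treat this as a rough sketch rather than a verified procedure:
import requests

MASTER = "http://10.132.0.10"  # placeholder: my master's address

# Rough sketch: dump the Kafka scheduler's plan (phases/steps and their
# statuses) to see whether the broker phase is stuck waiting for resources.
resp = requests.get(MASTER + "/service/kafka/v1/plan")
print(resp.status_code)
print(resp.text)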
Example app that also can't run on the second node (a sketch for submitting it to Marathon follows the JSON):
{
"id": "/sleepscript",
"cmd": " while :; do echo 'Hit CTRL+C'; sleep 1; done",
"cpus": 1,
"mem": 128,
"disk": 0,
"instances": 0,
"acceptedResourceRoles": [
"*"
],
"portDefinitions": [
{
"port": 10000,
"protocol": "tcp",
"labels": {}
}
]
}
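This is roughly how I submit that test app and watch it get stuck (a sketch; the /service/marathon prefix, the file name, and the lack of an auth token are assumptions about my test setup):
import json
import requests

MASTER = "http://10.132.0.10"              # placeholder: my master's address
MARATHON = MASTER + "/service/marathon"    # assumed Marathon proxy path

# Submit the app definition above (scale "instances" up as needed).
with open("sleepscript.json") as f:        # hypothetical file holding the JSON above
    app = json.load(f)
print(requests.post(MARATHON + "/v2/apps", json=app).status_code)

# Apps Marathon cannot place show up in the launch queue ("Waiting" in the UI).
print(requests.get(MARATHON + "/v2/queue").text)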
#edit2:
These are some interesting lines from the logs. I'm trying to run a few instances of the same task, so the first node (*.11) is at 100% usage and the second node (*.12) is empty:
Sep 04 09:07:01 master-1 java[2406]: [2017-09-04 09:07:01,682] INFO Finished processing e74ab83d-b856-4c89-a5fb-0eccb4e74268-O1269 from 10.132.0.11. Matched 0 ops after 1 passes. ports(*) 1025->2180,2182->3887,3889->5049,5052->8079,8082->8180,8182->9380,9383->29823,29825->32000; disk(*) 50100.0; disk(*) 6270.0; mem(*) 8639.0 left. (mesosphere.marathon.core.matcher.manager.impl.OfferMatcherManagerActor:marathon-akka.actor.default-dispatcher-81)
Sep 04 09:07:01 master-1 java[2406]: [2017-09-04 09:07:01,682] INFO Finished processing e74ab83d-b856-4c89-a5fb-0eccb4e74268-O1268 from 10.132.0.12. Matched 0 ops after 1 passes. ports(*) 1026->2180,2182->3887,3889->5049,5052->8079,8082->8180,8182->9099,9101->32000; disk(*) 50100.0; disk(*) 1263.0; cpus(*) 0.5; mem(*) 9279.0 left. (mesosphere.marathon.core.matcher.manager.impl.OfferMatcherManagerActor:marathon-akka.actor.default-dispatcher-81)
Sep 04 09:07:01 master-1 mesos-master[1963]: I0904 09:07:01.683557 1994 master.cpp:4732] Processing DECLINE call for offers: [ e74ab83d-b856-4c89-a5fb-0eccb4e74268-O1269 ] for framework a5370699-baae-4e77-a48e-1283d36b1906-0000 (marathon) at scheduler-88e19c66-42c6-48be-be03-662a9312cd91#10.132.0.10:15101
Sep 04 09:07:01 master-1 mesos-master[1963]: I0904 09:07:01.683799 1994 master.cpp:4732] Processing DECLINE call for offers: [ e74ab83d-b856-4c89-a5fb-0eccb4e74268-O1268 ] for framework a5370699-baae-4e77-a48e-1283d36b1906-0000 (marathon) at scheduler-88e19c66-42c6-48be-be03-662a9312cd91#10.132.0.10:15101
On the second node there is one line that differs from the first node: cpus(*) 0.5;
I guess the first node shows no cpus because they are all in use, while the second one shows 0.5 CPU. Does that mean half a CPU?
These are the first offers, where the *.11 node accepts one offer and runs one instance:
Sep 04 09:06:56 master-1 mesos-master[1963]: I0904 09:06:56.671070 1998 master.cpp:7029] Sending 2 offers to framework a5370699-baae-4e77-a48e-1283d36b1906-0000 (marathon) at scheduler-88e19c66-42c6-48be-be03-662a9312cd91#10.132.0.10:15101
Sep 04 09:06:56 master-1 java[2406]: [2017-09-04 09:06:56,675] INFO Offer [e74ab83d-b856-4c89-a5fb-0eccb4e74268-O1266]. Considering resources with roles {*} without resident reservation labels. Not all basic resources satisfied: cpus NOT SATISFIED (1.0 > 0.5), mem SATISFIED (128.0 <= 128.0) (mesosphere.mesos.ResourceMatcher$:marathon-akka.actor.default-dispatcher-75)
Sep 04 09:06:56 master-1 java[2406]: [2017-09-04 09:06:56,675] INFO Offer [e74ab83d-b856-4c89-a5fb-0eccb4e74268-O1265]. Considering resources with roles {*} without resident reservation labels. Not all basic resources satisfied: cpus NOT SATISFIED (1.0 > 0.0), mem SATISFIED (128.0 <= 128.0) (mesosphere.mesos.ResourceMatcher$:marathon-akka.actor.default-dispatcher-75)
Sep 04 09:06:56 master-1 java[2406]: [2017-09-04 09:06:56,676] INFO Finished processing e74ab83d-b856-4c89-a5fb-0eccb4e74268-O1266 from 10.132.0.12. Matched 0 ops after 1 passes. ports(*) 1026->2180,2182->3887,3889->5049,5052->8079,8082->8180,8182->9099,9101->32000; disk(*) 50100.0; disk(*) 1263.0; cpus(*) 0.5; mem(*) 9279.0 left. (mesosphere.marathon.core.matcher.manager.impl.OfferMatcherManagerActor:marathon-akka.actor.default-dispatcher-75)
Sep 04 09:06:56 master-1 java[2406]: [2017-09-04 09:06:56,676] INFO Finished processing e74ab83d-b856-4c89-a5fb-0eccb4e74268-O1265 from 10.132.0.11. Matched 1 ops after 2 passes. ports(*) 1025->2180,2182->3887,3889->5049,5052->8079,8082->8180,8182->9380,9383->29823,29825->32000; disk(*) 50100.0; disk(*) 6270.0; mem(*) 8639.0 left. (mesosphere.marathon.core.matcher.manager.impl.OfferMatcherManagerActor:marathon-akka.actor.default-dispatcher-75)
Still, the log makes it look like the *.11 node has 0 CPUs, yet the last line shows that one match passed, and later there are logs with the ID of the running task.
#edit3:
Bingo: after creating tasks that use 0.1 CPU, the *.12 node is finally used, consuming the whole 0.5 CPU it has. That node also can't run more than two 0.2-CPU tasks at once, so this 0.5 CPU really is the limit. Something restricts the node to 0.5 CPU, but I have no idea what.
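One thing I can still check on the affected agent is whether it carries a static resource override that caps what it advertises. /var/lib/dcos/mesos-resources is the file I end up deleting in #edit5 below; whether it contains a resource specification on this node is an assumption to verify (a small sketch):
from pathlib import Path

# Sketch: on the *.12 agent, print any static resource specification that
# might be capping the advertised CPUs (same file as in #edit5 below).
p = Path("/var/lib/dcos/mesos-resources")
print(p.read_text() if p.exists() else "no static resource file on this agent")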
#edit4:
DC/OS shows all 2 CPUs for that node, the Mesos UI also shows 2 CPUs as the total, and /proc/cpuinfo also shows two cores.
#edit5:
I "fixed" the problem by deleting /var/lib/dcos/mesos-resources and entire content of /var/lib/mesos/slave/meta directory. However I don't know what caused this problem in first place.
#anotherEdit
mesos/master/slaves content while it was behaving incorrectly (a quick arithmetic check on these numbers follows the dump):
{"slaves":[{"id":"6c620a26-7bc6-4287-b67a-de2b0eb8778c-S3","hostname":"10.132.0.12","port":5051,"attributes":{},"pid":"slave(1)#10.132.0.12:5051","registered_time":1504683660.37257,"reregistered_time":1504683660.37296,"resources":{"disk":52824.0,"mem":11839.0,"gpus":0.0,"cpus":2.0,"ports":"[1025-2180, 2182-3887, 3889-5049, 5052-8079, 8082-8180, 8182-32000]"},"used_resources":{"disk":50100.0,"mem":7168.0,"gpus":0.0,"cpus":1.3,"ports":"[7000-7001, 7199-7199, 9000-9001, 9042-9042, 9160-9160]"},"offered_resources":{"disk":0.0,"mem":0.0,"gpus":0.0,"cpus":0.0},"reserved_resources":{"cassandra-role":{"disk":50100.0,"mem":5376.0,"gpus":0.0,"cpus":1.5,"ports":"[7000-7001, 7199-7199, 9001-9001, 9042-9042, 9160-9160]"}},"unreserved_resources":{"disk":2724.0,"mem":6463.0,"gpus":0.0,"cpus":0.5,"ports":"[1025-2180, 2182-3887, 3889-5049, 5052-6999, 7002-7198, 7200-8079, 8082-8180, 8182-9000, 9002-9041, 9043-9159, 9161-32000]"},"active":true,"version":"1.2.2","reserved_resources_full":{"cassandra-role":[{"name":"cpus","type":"SCALAR","scalar":{"value":0.5},"role":"cassandra-role","reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"5827371e-2139-4278-99b5-85e7dd573f4c"}]}}},{"name":"mem","type":"SCALAR","scalar":{"value":4096.0},"role":"cassandra-role","reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"c5c881b0-45df-458b-9731-11cfb1b251c3"}]}}},{"name":"cpus","type":"SCALAR","scalar":{"value":0.5},"role":"cassandra-role","reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"1fed2809-f03e-4600-bfa7-b83a382dafbb"}]}}},{"name":"mem","type":"SCALAR","scalar":{"value":1024.0},"role":"cassandra-role","reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"3fa104ed-8fec-47ce-9702-6cde32bc97de"}]}}},{"name":"ports","type":"RANGES","ranges":{"range":[{"begin":9001,"end":9001}]},"role":"cassandra-role","reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"072d9c13-4205-4ab0-bff2-59e2604774c1"}]}}},{"name":"cpus","type":"SCALAR","scalar":{"value":0.5},"role":"cassandra-role","reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"c06733c6-fe7e-43a1-a3b7-70371d7cae08"}]}}},{"name":"mem","type":"SCALAR","scalar":{"value":256.0},"role":"cassandra-role","reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"18b050cf-f45d-4e2d-8e32-b7d7028d4939"}]}}},{"name":"ports","type":"RANGES","ranges":{"range":[{"begin":7000,"end":7001},{"begin":7199,"end":7199},{"begin":9042,"end":9042},{"begin":9160,"end":9160}]},"role":"cassandra-role","reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"f49161b4-071f-45e6-9316-62b81c009f8a"}]}}},{"name":"disk","type":"SCALAR","scalar":{"value":50100.0},"role":"cassandra-role","reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"359a466c-1191-441e-8058-d77c84fbdf0f"}]}},"disk":{"persistence":{"id":"0df2d25d-8c64-42ac-ab65-1de63392b5e8","principal":"cassandra-principal"},"volume":{"mode":"RW","container_path":"volume"},"source":{"type":"MOUNT","mount":{"root":"\/dcos\/volume0"}}}}]},"used_resources_full":[{"name":"cpus","type":"SCALAR","scalar":{"value":0.5},"role":"cassandra-role","allocation_info":{"role":"cassandra-role"},"reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key"
:"resource_id","value":"1fed2809-f03e-4600-bfa7-b83a382dafbb"}]}}},{"name":"mem","type":"SCALAR","scalar":{"value":1024.0},"role":"cassandra-role","allocation_info":{"role":"cassandra-role"},"reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"3fa104ed-8fec-47ce-9702-6cde32bc97de"}]}}},{"name":"ports","type":"RANGES","ranges":{"range":[{"begin":9001,"end":9001}]},"role":"cassandra-role","allocation_info":{"role":"cassandra-role"},"reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"072d9c13-4205-4ab0-bff2-59e2604774c1"}]}}},{"name":"cpus","type":"SCALAR","scalar":{"value":0.5},"role":"cassandra-role","allocation_info":{"role":"cassandra-role"},"reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"c06733c6-fe7e-43a1-a3b7-70371d7cae08"}]}}},{"name":"mem","type":"SCALAR","scalar":{"value":4096.0},"role":"cassandra-role","allocation_info":{"role":"cassandra-role"},"reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"c5c881b0-45df-458b-9731-11cfb1b251c3"}]}}},{"name":"disk","type":"SCALAR","scalar":{"value":50100.0},"role":"cassandra-role","allocation_info":{"role":"cassandra-role"},"reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"359a466c-1191-441e-8058-d77c84fbdf0f"}]}},"disk":{"persistence":{"id":"0df2d25d-8c64-42ac-ab65-1de63392b5e8","principal":"cassandra-principal"},"volume":{"mode":"RW","container_path":"volume"},"source":{"type":"MOUNT","mount":{"root":"\/dcos\/volume0"}}}},{"name":"ports","type":"RANGES","ranges":{"range":[{"begin":7000,"end":7001},{"begin":7199,"end":7199},{"begin":9042,"end":9042},{"begin":9160,"end":9160}]},"role":"cassandra-role","allocation_info":{"role":"cassandra-role"},"reservation":{"principal":"cassandra-principal","labels":{"labels":[{"key":"resource_id","value":"f49161b4-071f-45e6-9316-62b81c009f8a"}]}}},{"name":"cpus","type":"SCALAR","scalar":{"value":0.3},"role":"*","allocation_info":{"role":"slave_public"}},{"name":"mem","type":"SCALAR","scalar":{"value":2048.0},"role":"*","allocation_info":{"role":"slave_public"}},{"name":"ports","type":"RANGES","ranges":{"range":[{"begin":9000,"end":9000}]},"role":"*","allocation_info":{"role":"slave_public"}}],"offered_resources_full":[]},{"id":"6c620a26-7bc6-4287-b67a-de2b0eb8778c-S2","hostname":"10.132.0.11","port":5051,"attributes":{},"pid":"slave(1)#10.132.0.11:5051","registered_time":1504683660.32389,"reregistered_time":1504683660.32425,"resources":{"disk":56370.0,"mem":11839.0,"gpus":0.0,"cpus":2.0,"ports":"[1025-2180, 2182-3887, 3889-5049, 5052-8079, 8082-8180, 8182-32000]"},"used_resources":{"disk":512.0,"mem":2048.0,"gpus":0.0,"cpus":0.6,"ports":"[23803-23805]"},"offered_resources":{"disk":0.0,"mem":0.0,"gpus":0.0,"cpus":0.0},"reserved_resources":{"kafka-role":{"disk":5000.0,"mem":2560.0,"gpus":0.0,"cpus":1.0,"ports":"[1025-1025, 9100-9100]"}},"unreserved_resources":{"disk":51370.0,"mem":9279.0,"gpus":0.0,"cpus":1.0,"ports":"[1026-2180, 2182-3887, 3889-5049, 5052-8079, 8082-8180, 8182-9099, 
9101-32000]"},"active":true,"version":"1.2.2","reserved_resources_full":{"kafka-role":[{"name":"cpus","type":"SCALAR","scalar":{"value":0.5},"role":"kafka-role","reservation":{"principal":"kafka-principal","labels":{"labels":[{"key":"resource_id","value":"b722808d-79a2-4871-8c97-e95a2c847fbd"}]}}},{"name":"mem","type":"SCALAR","scalar":{"value":256.0},"role":"kafka-role","reservation":{"principal":"kafka-principal","labels":{"labels":[{"key":"resource_id","value":"6eb2865d-f823-473f-8c69-49a72eb7dbf6"}]}}},{"name":"ports","type":"RANGES","ranges":{"range":[{"begin":1025,"end":1025}]},"role":"kafka-role","reservation":{"principal":"kafka-principal","labels":{"labels":[{"key":"dynamic_port","value":"API_PORT"},{"key":"resource_id","value":"1deb5417-7852-49fb-ac81-fceb073369a8"}]}}},{"name":"cpus","type":"SCALAR","scalar":{"value":0.5},"role":"kafka-role","reservation":{"principal":"kafka-principal","labels":{"labels":[{"key":"resource_id","value":"fe21a9d8-0ba0-4d82-9bae-ca252d72bf93"}]}}},{"name":"mem","type":"SCALAR","scalar":{"value":2304.0},"role":"kafka-role","reservation":{"principal":"kafka-principal","labels":{"labels":[{"key":"resource_id","value":"1beca7e2-bf10-43be-be28-0f93b03eac37"}]}}},{"name":"disk","type":"SCALAR","scalar":{"value":5000.0},"role":"kafka-role","reservation":{"principal":"kafka-principal","labels":{"labels":[{"key":"resource_id","value":"d7e9cb00-2204-481a-a999-cec4fee5d248"}]}},"disk":{"persistence":{"id":"661b8d81-8d33-432d-bd47-5538718730f9","principal":"kafka-principal"},"volume":{"mode":"RW","container_path":"kafka-volume-11c8a735-646f-4c46-9af5-ac2cbc52b697"}}},{"name":"ports","type":"RANGES","ranges":{"range":[{"begin":9100,"end":9100}]},"role":"kafka-role","reservation":{"principal":"kafka-principal","labels":{"labels":[{"key":"resource_id","value":"99c8303c-1892-4fb8-bd0f-8618ad877654"}]}}}]},"used_resources_full":[{"name":"cpus","type":"SCALAR","scalar":{"value":0.6},"role":"*","allocation_info":{"role":"slave_public"}},{"name":"mem","type":"SCALAR","scalar":{"value":2048.0},"role":"*","allocation_info":{"role":"slave_public"}},{"name":"disk","type":"SCALAR","scalar":{"value":512.0},"role":"*","allocation_info":{"role":"slave_public"}},{"name":"ports","type":"RANGES","ranges":{"range":[{"begin":23803,"end":23805}]},"role":"*","allocation_info":{"role":"slave_public"}}],"offered_resources_full":[]}],"recovered_slaves":[]}
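Reading the numbers in that dump for the *.12 agent, the 0.5 CPU the offers kept showing is just what is left after the cassandra-role reservation (plain arithmetic over the values above, assuming I read them correctly):
# Values copied from the *.12 entry in the JSON above.
total_cpus = 2.0                 # "resources": {"cpus": 2.0, ...}
reserved_cassandra_cpus = 1.5    # "reserved_resources": {"cassandra-role": {"cpus": 1.5, ...}}
unreserved_cpus = total_cpus - reserved_cassandra_cpus
print(unreserved_cpus)           # 0.5 -- matches the "cpus(*) 0.5" left in the offers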
