Audit and Traceability¶
Configuration Management¶
Git¶
Each user has a copy of the configuration in their operator environment. They can make changes locally, test them, and then synchronize the platform configuration. To manage conflicts, the entire punch configuration is tracked by git.
Info
Git is a version-control system for tracking changes in computer files and coordinating work on those files among multiple people. It is primarily used for source-code management in software development, but it can be used to keep track of changes in any set of files. As a distributed revision-control system, it is aimed at speed, data integrity, and support for distributed, non-linear workflows.
Zookeeper¶
To monitor the health of the platform and all the running applications, it is crucial to first know precisely what the running configuration is. For instance: how many channels and punchlines are running? How many nodes are deployed in the elasticsearch cluster?
The punch administrative commands such as channelctl internally use zookeeper to store the last running configurations. The channelctl command provides online help and debugging facilities to easily query the current status of a tenant.
Centralized Monitoring¶
Platform monitoring¶
The punch integrates components to automatically compute the runtime health and status of every component at play. The resulting status documents are stored in turn in elasticsearch and are exposed to third-party monitoring tools.
Here is an example of such a monitoring document:
{
"@timestamp": "2018-11-02T14:11:29.195Z",
"storm": {
"health_code": 0,
"health_name": "green",
"clusters": {
"main": {
"nimbus": {
"hosts": {
"server4": {
"health_code": 0,
"health_name": "green",
"details": {
"port": 6627,
"host": "server4",
"nimbusLogLink": "http://server4:8000/daemonlog?file=nimbus.log",
"nimbusUpTime": "5d 17h 27m 35s",
"nimbusUpTimeSeconds": 494855,
"version": "1.1.1",
"status": "Leader"
}
}
}
},
"cluster": {
"health_code": 0,
"health_name": "green",
"details": {
"stormVersion": "1.1.1",
"executorsTotal": 22,
"totalMem": 12288,
"availCpu": 200,
"slotsTotal": 20,
"bugtracker-url": "null",
"slotsUsed": 6,
"topologies": 6,
"totalCpu": 200,
"cpuAssignedPercentUtil": "0.0",
"availMem": 10648,
"name": "main",
"slotsFree": 14,
"memAssignedPercentUtil": "13.3",
"central-log-url": "null",
"user": "null",
"tasksTotal": 22,
"schedulerDisplayResource": false,
"supervisors": 2
}
},
"health_code": 0,
"health_name": "green",
"supervisor": {
"hosts": {
"server6": {
"health_code": 0,
"health_name": "green",
"details": {
"logLink": "http://server6:8000/daemonlog?file=supervisor.log",
"totalMem": 6144,
"availCpu": 100,
"usedMem": 678,
"slotsTotal": 10,
"slotsUsed": 2,
"version": "1.1.1",
"uptimeSeconds": 495099,
"uptime": "5d 17h 31m 39s",
"totalCpu": 100,
"availMem": 5466,
"host": "server6",
"slotsFree": 8,
"id": "c026c4ff-23ce-4094-a229-284e036d0ec2",
"usedCpu": 0
}
},
"server4": {
"health_code": 0,
"health_name": "green",
"details": {
"logLink": "http://server4:8000/daemonlog?file=supervisor.log",
"totalMem": 6144,
"availCpu": 100,
"usedMem": 962,
"slotsTotal": 10,
"slotsUsed": 4,
"version": "1.1.1",
"uptimeSeconds": 495074,
"uptime": "5d 17h 31m 14s",
"totalCpu": 100,
"availMem": 5182,
"host": "server4",
"slotsFree": 6,
"id": "276af295-14b7-4de1-84f6-0611a39a482e",
"usedCpu": 0
}
}
}
}
}
}
},
"elasticsearch": {
"health_code": 0,
"health_name": "green",
"clusters": {
"es_search": {
"health_code": 0,
"health_name": "green",
"details": {
"number_of_pending_tasks": 0,
"cluster_name": "es_search",
"active_shards": 538,
"active_primary_shards": 269,
"unassigned_shards": 0,
"delayed_unassigned_shards": 0,
"timed_out": false,
"relocating_shards": 0,
"initializing_shards": 0,
"task_max_waiting_in_queue_millis": 0,
"number_of_data_nodes": 3,
"number_of_in_flight_fetch": 0,
"active_shards_percent_as_number": 100,
"status": "green",
"number_of_nodes": 3
}
}
}
},
"zookeeper": {
"health_code": 0,
"health_name": "green",
"clusters": {
"common": {
"health_code": 0,
"hosts": {
"server6": {
"health_code": 0,
"health_name": "green",
"details": {
"zk_packets_sent": 9074725,
"zk_max_latency": 16859,
"zk_server_state": "follower",
"zk_outstanding_requests": 0,
"zk_ephemerals_count": 33,
"zk_min_latency": 0,
"zk_max_file_descriptor_count": 100000,
"zk_approximate_data_size": 3459895,
"zk_version": "3.4.10-39d3a4f269333c922ed3db283be479f9deacaa0f,",
"zk_avg_latency": 9,
"zk_znode_count": 1142,
"zk_watch_count": 5290,
"zk_open_file_descriptor_count": 49,
"zk_packets_received": 7964841,
"zk_num_alive_connections": 9
}
},
"server5": {
"health_code": 0,
"health_name": "green",
"details": {
"zk_followers": 2,
"zk_packets_sent": 283129945,
"zk_pending_syncs": 0,
"zk_synced_followers": 2,
"zk_max_latency": 669680,
"zk_server_state": "leader",
"zk_outstanding_requests": 0,
"zk_ephemerals_count": 33,
"zk_min_latency": 0,
"zk_max_file_descriptor_count": 100000,
"zk_approximate_data_size": 3459898,
"zk_version": "3.4.10-39d3a4f269333c922ed3db283be479f9deacaa0f,",
"zk_avg_latency": 14,
"zk_znode_count": 1142,
"zk_watch_count": 159332,
"zk_open_file_descriptor_count": 50,
"zk_packets_received": 278374614,
"zk_num_alive_connections": 8
}
},
"server4": {
"health_code": 0,
"health_name": "green",
"details": {
"zk_packets_sent": 27723571,
"zk_max_latency": 344801,
"zk_server_state": "follower",
"zk_outstanding_requests": 0,
"zk_ephemerals_count": 33,
"zk_min_latency": 0,
"zk_max_file_descriptor_count": 100000,
"zk_approximate_data_size": 3459898,
"zk_version": "3.4.10-39d3a4f269333c922ed3db283be479f9deacaa0f,",
"zk_avg_latency": 20,
"zk_znode_count": 1142,
"zk_watch_count": 85213,
"zk_open_file_descriptor_count": 50,
"zk_packets_received": 26959577,
"zk_num_alive_connections": 10
}
}
},
"health_name": "green"
}
}
},
"kafka": {
"health_code": 2,
"health_name": "red",
"clusters": {
"local": {
"brokers": {
"1": {
"health_code": 0,
"health_name": "green",
"details": {
"listener_security_protocol_map": {
"PLAINTEXT": "PLAINTEXT"
},
"endpoints": [
"PLAINTEXT://server4:9092"
],
"jmx_port": -1,
"port": 9092,
"host": "server4",
"id": "1",
"version": 4,
"timestamp": "1539681285784"
}
},
"2": {
"health_code": 0,
"health_name": "green",
"details": {
"listener_security_protocol_map": {
"PLAINTEXT": "PLAINTEXT"
},
"endpoints": [
"PLAINTEXT://server5:9092"
],
"jmx_port": -1,
"port": 9092,
"host": "server5",
"id": "2",
"version": 4,
"timestamp": "1534301326278"
}
},
"3": {
"health_code": 2,
"health_name": "red",
"details": {
"id": "3"
}
}
},
"health_code": 2,
"health_name": "red"
}
}
},
"shiva": {
"health_code": 1,
"health_name": "yellow",
"clusters": {
"shivamain": {
"health_code": 1,
"health_name": "yellow",
"details": {
"cluster_name": "shivamain",
"cluster": {
"election_timestamp": "2018-10-06T18:54:46.869Z",
"assignments": {
"25838@server4": [
"/shiva/shivamain/tasks/punchplatform/mytenant/services/admin/elasticsearch_housekeeping_service",
"/shiva/shivamain/tasks/punchplatform/mytenant/services/admin/kafka_service",
"/shiva/shivamain/tasks/punchplatform/platform/services/admin/elasticsearch_housekeeping_service"
]
},
"leader_id": "25838@server4",
"unassigned_tasks": {}
}
}
}
}
},
"platform": {
"health_code": 2,
"health_name": "red"
}
}
Runtime monitoring¶
Each running component publishes metrics or logs to elasticsearch. This allows a user to track stop/start activities or configuration changes. For instance, each component publishes an uptime metric.
platform-<date>
: Contains the platform stack metrics status
shiva-logs-<date>
: Contains the logs (STDOUT & STDERR) of the tasks running in shiva
punchplatform
: Full status of the platform
metricbeat-<version>-<date>
: Contains system metrics such as memory, CPU, storage
<tenant>-metrics-<date>
: Contains the runtime metrics of storm topologies
Traceability¶
Configuration Change History¶
With git, all the past changes are stored.
At any time, the history is accessible:
git log
commit f4155ebb3a2524b4e474da6654689f9c8005ee74
Author: Loic JARDIN <loic.jardin@thalesgroup.com>
Date: Mon Oct 15 18:07:30 2018 +0200
update ais
commit 78b9a271873c62fcfb17a468c2acf876133f13a8
Author: Loic jardin <loic.jardin@thalesgroup.com>
Date: Fri Oct 5 15:18:45 2018 +0200
rollback change
commit b65114642c877be96a20d89dba9669d5794d36d1
Author: Loic JARDIN <loic.jardin@thalesgroup.com>
Date: Thu Oct 4 14:52:10 2018 +0200
add operator cedric on livedemo
To see the complete changes introduced by each commit:
git log -p
commit b65114642c877be96a20d89dba9669d5794d36d1
Author: Loic JARDIN <loic.jardin@thalesgroup.com>
Date: Thu Oct 4 14:52:10 2018 +0200
add operator cedric on livedemo
diff --git a/platform/livedemo/generated_inventory/group_vars/git_servers b/platform/livedemo/generated_inventory/group_vars/git_servers
index b488ef7..38c3b04 100644
--- a/platform/livedemo/generated_inventory/group_vars/git_servers
+++ b/platform/livedemo/generated_inventory/group_vars/git_servers
@@ -22,4 +22,6 @@ key_to_copy:
- /tmp/rsa_pub/server4/home/sgrah/.ssh/id_rsa.pub
+ - /tmp/rsa_pub/server4/home/cedric/.ssh/id_rsa.pub
+
diff --git a/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers b/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers
index 9726d0f..94bebc1 100644
--- a/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers
+++ b/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers
@@ -38,6 +38,7 @@ operators_username:
- punchplatform
- rpenco
- sgrah
+ - cedric
####### ADMIN USER & GROUP ##########
diff --git a/platform/livedemo/livedemo-punchplatform-deployment.settings b/platform/livedemo/livedemo-punchplatform-deployment.settings
index 54b850c..9d29a8e 100644
--- a/platform/livedemo/livedemo-punchplatform-deployment.settings
+++ b/platform/livedemo/livedemo-punchplatform-deployment.settings
@@ -26,7 +26,7 @@
"punchplatform_operator" : {
"configuration_name_dir_from_home" : "pp-conf-livedemo",
- "operators_username" : ["loic","qmarinie","leo","dimi","obarbier","punchplatform","rpenco","sgrah"],
+ "operators_username" : ["loic","qmarinie","leo","dimi","obarbier","punchplatform","rpenco","sgrah","cedric"],
"servers" : {
"server4" : {}
}
Logs¶
All logs from all applications and components are collected and centralised in the platform elasticsearch administrative instance. This provides easy and valuable insight into any abnormal behaviour.
In addition, all punch application logs have a well-defined key-value format.