Skip to content

Audit and Traceability

Configuration Management

Git

Each user has a copy of the configuration in their operator environment. They can make changes locally, test them, and then synchronize the platform configuration. To manage conflicts, all the punch configuration is tracked by git.

Info

Git is a version-control system for tracking changes in computer files and coordinating work on those files among multiple people. It is primarily used for source-code management in software development, but it can be used to keep track of changes in any set of files. As a distributed revision-control system, it is aimed at speed, data integrity, and support for distributed, non-linear workflows.

Zookeeper

To monitor the health of the platform and all the running applications, it is crucial to first know exactly which configuration is running. For instance: how many channels and punchlines are running? How many nodes are deployed in the elasticsearch cluster?

The punch administrative commands such as channelctl internally use zookeeper to store the last running configurations. The channelctl command provides online help and debug facilities to easily query the current status of a tenant.

Centralized Monitoring

Platform monitoring

The punch integrates components to automatically compute the runtime health and status of every component at play. The resulting status documents are stored in turn in elasticsearch and are exposed to third-party monitoring tools.

Here is an example of the type of monitoring documents:

{
    "@timestamp": "2018-11-02T14:11:29.195Z",
    "storm": {
      "health_code": 0,
      "health_name": "green",
      "clusters": {
        "main": {
          "nimbus": {
            "hosts": {
              "server4": {
                "health_code": 0,
                "health_name": "green",
                "details": {
                  "port": 6627,
                  "host": "server4",
                  "nimbusLogLink": "http://server4:8000/daemonlog?file=nimbus.log",
                  "nimbusUpTime": "5d 17h 27m 35s",
                  "nimbusUpTimeSeconds": 494855,
                  "version": "1.1.1",
                  "status": "Leader"
                }
              }
            }
          },
          "cluster": {
            "health_code": 0,
            "health_name": "green",
            "details": {
              "stormVersion": "1.1.1",
              "executorsTotal": 22,
              "totalMem": 12288,
              "availCpu": 200,
              "slotsTotal": 20,
              "bugtracker-url": "null",
              "slotsUsed": 6,
              "topologies": 6,
              "totalCpu": 200,
              "cpuAssignedPercentUtil": "0.0",
              "availMem": 10648,
              "name": "main",
              "slotsFree": 14,
              "memAssignedPercentUtil": "13.3",
              "central-log-url": "null",
              "user": "null",
              "tasksTotal": 22,
              "schedulerDisplayResource": false,
              "supervisors": 2
            }
          },
          "health_code": 0,
          "health_name": "green",
          "supervisor": {
            "hosts": {
              "server6": {
                "health_code": 0,
                "health_name": "green",
                "details": {
                  "logLink": "http://server6:8000/daemonlog?file=supervisor.log",
                  "totalMem": 6144,
                  "availCpu": 100,
                  "usedMem": 678,
                  "slotsTotal": 10,
                  "slotsUsed": 2,
                  "version": "1.1.1",
                  "uptimeSeconds": 495099,
                  "uptime": "5d 17h 31m 39s",
                  "totalCpu": 100,
                  "availMem": 5466,
                  "host": "server6",
                  "slotsFree": 8,
                  "id": "c026c4ff-23ce-4094-a229-284e036d0ec2",
                  "usedCpu": 0
                }
              },
              "server4": {
                "health_code": 0,
                "health_name": "green",
                "details": {
                  "logLink": "http://server4:8000/daemonlog?file=supervisor.log",
                  "totalMem": 6144,
                  "availCpu": 100,
                  "usedMem": 962,
                  "slotsTotal": 10,
                  "slotsUsed": 4,
                  "version": "1.1.1",
                  "uptimeSeconds": 495074,
                  "uptime": "5d 17h 31m 14s",
                  "totalCpu": 100,
                  "availMem": 5182,
                  "host": "server4",
                  "slotsFree": 6,
                  "id": "276af295-14b7-4de1-84f6-0611a39a482e",
                  "usedCpu": 0
                }
              }
            }
          }
        }
      }
    },
    "elasticsearch": {
      "health_code": 0,
      "health_name": "green",
      "clusters": {
        "es_search": {
          "health_code": 0,
          "health_name": "green",
          "details": {
            "number_of_pending_tasks": 0,
            "cluster_name": "es_search",
            "active_shards": 538,
            "active_primary_shards": 269,
            "unassigned_shards": 0,
            "delayed_unassigned_shards": 0,
            "timed_out": false,
            "relocating_shards": 0,
            "initializing_shards": 0,
            "task_max_waiting_in_queue_millis": 0,
            "number_of_data_nodes": 3,
            "number_of_in_flight_fetch": 0,
            "active_shards_percent_as_number": 100,
            "status": "green",
            "number_of_nodes": 3
          }
        }
      }
    },
    "zookeeper": {
      "health_code": 0,
      "health_name": "green",
      "clusters": {
        "common": {
          "health_code": 0,
          "hosts": {
            "server6": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "zk_packets_sent": 9074725,
                "zk_max_latency": 16859,
                "zk_server_state": "follower",
                "zk_outstanding_requests": 0,
                "zk_ephemerals_count": 33,
                "zk_min_latency": 0,
                "zk_max_file_descriptor_count": 100000,
                "zk_approximate_data_size": 3459895,
                "zk_version": "3.4.10-39d3a4f269333c922ed3db283be479f9deacaa0f,",
                "zk_avg_latency": 9,
                "zk_znode_count": 1142,
                "zk_watch_count": 5290,
                "zk_open_file_descriptor_count": 49,
                "zk_packets_received": 7964841,
                "zk_num_alive_connections": 9
              }
            },
            "server5": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "zk_followers": 2,
                "zk_packets_sent": 283129945,
                "zk_pending_syncs": 0,
                "zk_synced_followers": 2,
                "zk_max_latency": 669680,
                "zk_server_state": "leader",
                "zk_outstanding_requests": 0,
                "zk_ephemerals_count": 33,
                "zk_min_latency": 0,
                "zk_max_file_descriptor_count": 100000,
                "zk_approximate_data_size": 3459898,
                "zk_version": "3.4.10-39d3a4f269333c922ed3db283be479f9deacaa0f,",
                "zk_avg_latency": 14,
                "zk_znode_count": 1142,
                "zk_watch_count": 159332,
                "zk_open_file_descriptor_count": 50,
                "zk_packets_received": 278374614,
                "zk_num_alive_connections": 8
              }
            },
            "server4": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "zk_packets_sent": 27723571,
                "zk_max_latency": 344801,
                "zk_server_state": "follower",
                "zk_outstanding_requests": 0,
                "zk_ephemerals_count": 33,
                "zk_min_latency": 0,
                "zk_max_file_descriptor_count": 100000,
                "zk_approximate_data_size": 3459898,
                "zk_version": "3.4.10-39d3a4f269333c922ed3db283be479f9deacaa0f,",
                "zk_avg_latency": 20,
                "zk_znode_count": 1142,
                "zk_watch_count": 85213,
                "zk_open_file_descriptor_count": 50,
                "zk_packets_received": 26959577,
                "zk_num_alive_connections": 10
              }
            }
          },
          "health_name": "green"
        }
      }
    },
    "kafka": {
      "health_code": 2,
      "health_name": "red",
      "clusters": {
        "local": {
          "brokers": {
            "1": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "listener_security_protocol_map": {
                  "PLAINTEXT": "PLAINTEXT"
                },
                "endpoints": [
                  "PLAINTEXT://server4:9092"
                ],
                "jmx_port": -1,
                "port": 9092,
                "host": "server4",
                "id": "1",
                "version": 4,
                "timestamp": "1539681285784"
              }
            },
            "2": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "listener_security_protocol_map": {
                  "PLAINTEXT": "PLAINTEXT"
                },
                "endpoints": [
                  "PLAINTEXT://server5:9092"
                ],
                "jmx_port": -1,
                "port": 9092,
                "host": "server5",
                "id": "2",
                "version": 4,
                "timestamp": "1534301326278"
              }
            },
            "3": {
              "health_code": 2,
              "health_name": "red",
              "details": {
                "id": "3"
              }
            }
          },
          "health_code": 2,
          "health_name": "red"
        }
      }
    },
    "shiva": {
      "health_code": 1,
      "health_name": "yellow",
      "clusters": {
        "shivamain": {
          "health_code": 1,
          "health_name": "yellow",
          "details": {
            "cluster_name": "shivamain",
            "cluster": {
              "election_timestamp": "2018-10-06T18:54:46.869Z",
              "assignements": {
                "25838@server4": [
                  "/shiva/shivamain/tasks/punchplatform/mytenant/services/admin/elasticsearch_housekeeping_service",
                  "/shiva/shivamain/tasks/punchplatform/mytenant/services/admin/kafka_service",
                  "/shiva/shivamain/tasks/punchplatform/platform/services/admin/elasticsearch_housekeeping_service"
                ]
              },
              "leader_id": "25838@server4",
              "unassigned_tasks": {}
            }
          }
        }
      }
    },
    "platform": {
      "health_code": 2,
      "health_name": "red"
    }
  }

Runtime monitoring

Each running component publishes metrics or logs to elasticsearch. This allows a user to track stop/start activities or configuration changes. For instance, each component publishes an uptime metric.

  • platform-<date>: Contains the platform stack metrics status
  • shiva-logs-<date>: Contains the logs (STDOUT & STDERR) of the tasks running in shiva
  • punchplatform: Full status of the platform
  • metricbeat-<version>-<date>: Contains system metrics such as memory, CPU, storage
  • <tenant>-metrics-<date>: Contains the runtime metrics of storm topologies

Traceability

Configuration Change History

With git, all the past changes are stored.

At any time, the history is accessible:

git log 
commit f4155ebb3a2524b4e474da6654689f9c8005ee74
Author: Loic JARDIN <loic.jardin@thalesgroup.com>
Date:   Mon Oct 15 18:07:30 2018 +0200

    update ais

commit 78b9a271873c62fcfb17a468c2acf876133f13a8
Author: Loic jardin <loic.jardin@thalesgroup.com>
Date:   Fri Oct 5 15:18:45 2018 +0200

    rollback change

commit b65114642c877be96a20d89dba9669d5794d36d1
Author: Loic JARDIN <loic.jardin@thalesgroup.com>
Date:   Thu Oct 4 14:52:10 2018 +0200

    add operator cedric on livedemo

To view the complete content of each change:

git log -p 
commit b65114642c877be96a20d89dba9669d5794d36d1
Author: Loic JARDIN <loic.jardin@thalesgroup.com>
Date:   Thu Oct 4 14:52:10 2018 +0200

    add operator cedric on livedemo

diff --git a/platform/livedemo/generated_inventory/group_vars/git_servers b/platform/livedemo/generated_inventory/group_vars/git_servers
index b488ef7..38c3b04 100644
--- a/platform/livedemo/generated_inventory/group_vars/git_servers
+++ b/platform/livedemo/generated_inventory/group_vars/git_servers
@@ -22,4 +22,6 @@ key_to_copy:

  - /tmp/rsa_pub/server4/home/sgrah/.ssh/id_rsa.pub

+ - /tmp/rsa_pub/server4/home/cedric/.ssh/id_rsa.pub
+

diff --git a/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers b/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers
index 9726d0f..94bebc1 100644
--- a/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers
+++ b/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers
@@ -38,6 +38,7 @@ operators_username:
  - punchplatform
  - rpenco
  - sgrah
+ - cedric


 ####### ADMIN USER & GROUP ##########
diff --git a/platform/livedemo/livedemo-punchplatform-deployment.settings b/platform/livedemo/livedemo-punchplatform-deployment.settings
index 54b850c..9d29a8e 100644
--- a/platform/livedemo/livedemo-punchplatform-deployment.settings
+++ b/platform/livedemo/livedemo-punchplatform-deployment.settings
@@ -26,7 +26,7 @@

 "punchplatform_operator" : {
     "configuration_name_dir_from_home" : "pp-conf-livedemo",
-    "operators_username" : ["loic","qmarinie","leo","dimi","obarbier","punchplatform","rpenco","sgrah"],
+    "operators_username" : ["loic","qmarinie","leo","dimi","obarbier","punchplatform","rpenco","sgrah","cedric"],
     "servers" : {
         "server4" : {}
     }

Logs

All logs from all applications and components are collected and centralised in the platform elasticsearch administrative instance. This provides easy and valuable insight into any abnormal behaviour.

In addition, all punch application logs have a well-defined key-value format.