Audit

Centralize configuration

Git

Each user has a copy of the configuration in their operator environment. They can make changes locally, test them, and then synchronize the platform configuration.

To manage conflicts, all the punch configuration is tracked by git.

description:

Git is a version-control system for tracking changes in computer files and coordinating work on those files among multiple people. It is primarily used for source-code management in software development, but it can be used to keep track of changes in any set of files. As a distributed revision-control system, it is aimed at speed, data integrity, and support for distributed, non-linear workflows.
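
As an illustration, a typical operator workflow could look like the following sketch (the pp-conf-livedemo directory name is taken from the deployment settings shown later on this page; the branch and remote setup are assumptions and depend on your platform):

cd ~/pp-conf-livedemo
# edit a tenant or channel configuration, then review the local changes
git status
git diff
# record the change locally with an explicit message
git commit -am "update mytenant channel configuration"
# synchronize with the shared platform configuration
git pull --rebase
git push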

Zookeeper

To monitor the health of the platform, it is crucial to know the last running configuration. For instance: how many channels are running? How many nodes are deployed in the elasticsearch cluster?

That is why the punch admin commands such as punchplatform-channel.sh and punchplatform-services.sh use zookeeper to store the last running configuration.

technical:

To read the last running configuration:

punchplatform-zookeeper-console.sh
# then go to the punchplatform root path; on a standalone it is punchplatform-primary
[zk: server4:2181(CONNECTED) 0] ls /punchplatform-livedemo/conf
[tenants, resources, punchplatform.properties]
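
The tree can then be explored with the usual zookeeper shell commands. For example (a sketch based on the listing above; the exact znode layout depends on your platform):

[zk: server4:2181(CONNECTED) 1] ls /punchplatform-livedemo/conf/tenants
[zk: server4:2181(CONNECTED) 2] get /punchplatform-livedemo/conf/punchplatform.properties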

Centralize monitoring

Platform monitoring

If deployed, shiva components monitor the full punch platform stack. An internal admin task runs periodically and stores the result of the monitoring checks in an elasticsearch cluster (often dedicated to metrics).
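
For instance, the latest platform health document can be fetched directly from that elasticsearch cluster. A minimal sketch, assuming the metrics cluster answers on localhost:9200 and uses the platform-<date> indices described in the Runtime monitoring section below:

curl -s 'http://localhost:9200/platform-*/_search?size=1&sort=@timestamp:desc&pretty'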

example of a platform health document:

{
    "@timestamp": "2018-11-02T14:11:29.195Z",
    "storm": {
      "health_code": 0,
      "health_name": "green",
      "clusters": {
        "main": {
          "nimbus": {
            "hosts": {
              "server4": {
                "health_code": 0,
                "health_name": "green",
                "details": {
                  "port": 6627,
                  "host": "server4",
                  "nimbusLogLink": "http://server4:8000/daemonlog?file=nimbus.log",
                  "nimbusUpTime": "5d 17h 27m 35s",
                  "nimbusUpTimeSeconds": 494855,
                  "version": "1.1.1",
                  "status": "Leader"
                }
              }
            }
          },
          "cluster": {
            "health_code": 0,
            "health_name": "green",
            "details": {
              "stormVersion": "1.1.1",
              "executorsTotal": 22,
              "totalMem": 12288,
              "availCpu": 200,
              "slotsTotal": 20,
              "bugtracker-url": "null",
              "slotsUsed": 6,
              "topologies": 6,
              "totalCpu": 200,
              "cpuAssignedPercentUtil": "0.0",
              "availMem": 10648,
              "name": "main",
              "slotsFree": 14,
              "memAssignedPercentUtil": "13.3",
              "central-log-url": "null",
              "user": "null",
              "tasksTotal": 22,
              "schedulerDisplayResource": false,
              "supervisors": 2
            }
          },
          "health_code": 0,
          "health_name": "green",
          "supervisor": {
            "hosts": {
              "server6": {
                "health_code": 0,
                "health_name": "green",
                "details": {
                  "logLink": "http://server6:8000/daemonlog?file=supervisor.log",
                  "totalMem": 6144,
                  "availCpu": 100,
                  "usedMem": 678,
                  "slotsTotal": 10,
                  "slotsUsed": 2,
                  "version": "1.1.1",
                  "uptimeSeconds": 495099,
                  "uptime": "5d 17h 31m 39s",
                  "totalCpu": 100,
                  "availMem": 5466,
                  "host": "server6",
                  "slotsFree": 8,
                  "id": "c026c4ff-23ce-4094-a229-284e036d0ec2",
                  "usedCpu": 0
                }
              },
              "server4": {
                "health_code": 0,
                "health_name": "green",
                "details": {
                  "logLink": "http://server4:8000/daemonlog?file=supervisor.log",
                  "totalMem": 6144,
                  "availCpu": 100,
                  "usedMem": 962,
                  "slotsTotal": 10,
                  "slotsUsed": 4,
                  "version": "1.1.1",
                  "uptimeSeconds": 495074,
                  "uptime": "5d 17h 31m 14s",
                  "totalCpu": 100,
                  "availMem": 5182,
                  "host": "server4",
                  "slotsFree": 6,
                  "id": "276af295-14b7-4de1-84f6-0611a39a482e",
                  "usedCpu": 0
                }
              }
            }
          }
        }
      }
    },
    "elasticsearch": {
      "health_code": 0,
      "health_name": "green",
      "clusters": {
        "es_search": {
          "health_code": 0,
          "health_name": "green",
          "details": {
            "number_of_pending_tasks": 0,
            "cluster_name": "es_search",
            "active_shards": 538,
            "active_primary_shards": 269,
            "unassigned_shards": 0,
            "delayed_unassigned_shards": 0,
            "timed_out": false,
            "relocating_shards": 0,
            "initializing_shards": 0,
            "task_max_waiting_in_queue_millis": 0,
            "number_of_data_nodes": 3,
            "number_of_in_flight_fetch": 0,
            "active_shards_percent_as_number": 100,
            "status": "green",
            "number_of_nodes": 3
          }
        }
      }
    },
    "zookeeper": {
      "health_code": 0,
      "health_name": "green",
      "clusters": {
        "common": {
          "health_code": 0,
          "hosts": {
            "server6": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "zk_packets_sent": 9074725,
                "zk_max_latency": 16859,
                "zk_server_state": "follower",
                "zk_outstanding_requests": 0,
                "zk_ephemerals_count": 33,
                "zk_min_latency": 0,
                "zk_max_file_descriptor_count": 100000,
                "zk_approximate_data_size": 3459895,
                "zk_version": "3.4.10-39d3a4f269333c922ed3db283be479f9deacaa0f,",
                "zk_avg_latency": 9,
                "zk_znode_count": 1142,
                "zk_watch_count": 5290,
                "zk_open_file_descriptor_count": 49,
                "zk_packets_received": 7964841,
                "zk_num_alive_connections": 9
              }
            },
            "server5": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "zk_followers": 2,
                "zk_packets_sent": 283129945,
                "zk_pending_syncs": 0,
                "zk_synced_followers": 2,
                "zk_max_latency": 669680,
                "zk_server_state": "leader",
                "zk_outstanding_requests": 0,
                "zk_ephemerals_count": 33,
                "zk_min_latency": 0,
                "zk_max_file_descriptor_count": 100000,
                "zk_approximate_data_size": 3459898,
                "zk_version": "3.4.10-39d3a4f269333c922ed3db283be479f9deacaa0f,",
                "zk_avg_latency": 14,
                "zk_znode_count": 1142,
                "zk_watch_count": 159332,
                "zk_open_file_descriptor_count": 50,
                "zk_packets_received": 278374614,
                "zk_num_alive_connections": 8
              }
            },
            "server4": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "zk_packets_sent": 27723571,
                "zk_max_latency": 344801,
                "zk_server_state": "follower",
                "zk_outstanding_requests": 0,
                "zk_ephemerals_count": 33,
                "zk_min_latency": 0,
                "zk_max_file_descriptor_count": 100000,
                "zk_approximate_data_size": 3459898,
                "zk_version": "3.4.10-39d3a4f269333c922ed3db283be479f9deacaa0f,",
                "zk_avg_latency": 20,
                "zk_znode_count": 1142,
                "zk_watch_count": 85213,
                "zk_open_file_descriptor_count": 50,
                "zk_packets_received": 26959577,
                "zk_num_alive_connections": 10
              }
            }
          },
          "health_name": "green"
        }
      }
    },
    "kafka": {
      "health_code": 2,
      "health_name": "red",
      "clusters": {
        "local": {
          "brokers": {
            "1": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "listener_security_protocol_map": {
                  "PLAINTEXT": "PLAINTEXT"
                },
                "endpoints": [
                  "PLAINTEXT://server4:9092"
                ],
                "jmx_port": -1,
                "port": 9092,
                "host": "server4",
                "id": "1",
                "version": 4,
                "timestamp": "1539681285784"
              }
            },
            "2": {
              "health_code": 0,
              "health_name": "green",
              "details": {
                "listener_security_protocol_map": {
                  "PLAINTEXT": "PLAINTEXT"
                },
                "endpoints": [
                  "PLAINTEXT://server5:9092"
                ],
                "jmx_port": -1,
                "port": 9092,
                "host": "server5",
                "id": "2",
                "version": 4,
                "timestamp": "1534301326278"
              }
            },
            "3": {
              "health_code": 2,
              "health_name": "red",
              "details": {
                "id": "3"
              }
            }
          },
          "health_code": 2,
          "health_name": "red"
        }
      }
    },
    "shiva": {
      "health_code": 1,
      "health_name": "yellow",
      "clusters": {
        "shivamain": {
          "health_code": 1,
          "health_name": "yellow",
          "details": {
            "cluster_name": "shivamain",
            "cluster": {
              "election_timestamp": "2018-10-06T18:54:46.869Z",
              "assignements": {
                "25838@server4": [
                  "/shiva/shivamain/tasks/punchplatform/mytenant/services/admin/elasticsearch_housekeeping_service",
                  "/shiva/shivamain/tasks/punchplatform/mytenant/services/admin/kafka_service",
                  "/shiva/shivamain/tasks/punchplatform/platform/services/admin/elasticsearch_housekeeping_service"
                ]
              },
              "leader_id": "25838@server4",
              "unassigned_tasks": {}
            }
          }
        }
      }
    },
    "platform": {
      "health_code": 2,
      "health_name": "red"
    }
  }

Runtime monitoring

Each running component publishes metrics or logs to an elasticsearch cluster. This allows a user to track stop/start activities or configuration changes. For instance, each component publishes an uptime metric. The main indices are listed below, followed by a query example to check them:

  • platform-<date>: Contains the platform stack metrics status

  • shiva-logs-<date>: Contains the logs (STDOUT & STDERR) of the tasks running in shiva

  • punchplatform: Full status of the platform

  • metricbeat-<version>-<date>: Contains system metrics such as memory, CPU, storage

  • <tenant>-metrics-<date>: Contains the runtime metrics of storm topologies
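
A quick way to check that these indices are actually being fed is the elasticsearch cat API. A minimal sketch, assuming the metrics cluster answers on localhost:9200:

curl -s 'http://localhost:9200/_cat/indices/platform-*,shiva-logs-*,metricbeat-*,*-metrics-*?v'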

Traceability

Git history

With git, all past changes are stored.

At any time, the history is accessible:

git log 
commit f4155ebb3a2524b4e474da6654689f9c8005ee74
Author: Loic JARDIN <loic.jardin@thalesgroup.com>
Date:   Mon Oct 15 18:07:30 2018 +0200

    update ais

commit 78b9a271873c62fcfb17a468c2acf876133f13a8
Author: Loic jardin <loic.jardin@thalesgroup.com>
Date:   Fri Oct 5 15:18:45 2018 +0200

    rollback change

commit b65114642c877be96a20d89dba9669d5794d36d1
Author: Loic JARDIN <loic.jardin@thalesgroup.com>
Date:   Thu Oct 4 14:52:10 2018 +0200

    add operator cedric on livedemo

And to see the full changes:

git log -p 
commit b65114642c877be96a20d89dba9669d5794d36d1
Author: Loic JARDIN <loic.jardin@thalesgroup.com>
Date:   Thu Oct 4 14:52:10 2018 +0200

    add operator cedric on livedemo

diff --git a/platform/livedemo/generated_inventory/group_vars/git_servers b/platform/livedemo/generated_inventory/group_vars/git_servers
index b488ef7..38c3b04 100644
--- a/platform/livedemo/generated_inventory/group_vars/git_servers
+++ b/platform/livedemo/generated_inventory/group_vars/git_servers
@@ -22,4 +22,6 @@ key_to_copy:

  - /tmp/rsa_pub/server4/home/sgrah/.ssh/id_rsa.pub

+ - /tmp/rsa_pub/server4/home/cedric/.ssh/id_rsa.pub
+

diff --git a/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers b/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers
index 9726d0f..94bebc1 100644
--- a/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers
+++ b/platform/livedemo/generated_inventory/group_vars/punchplatform_operator_servers
@@ -38,6 +38,7 @@ operators_username:
  - punchplatform
  - rpenco
  - sgrah
+ - cedric


 ####### ADMIN USER & GROUP ##########
diff --git a/platform/livedemo/livedemo-punchplatform-deployment.settings b/platform/livedemo/livedemo-punchplatform-deployment.settings
index 54b850c..9d29a8e 100644
--- a/platform/livedemo/livedemo-punchplatform-deployment.settings
+++ b/platform/livedemo/livedemo-punchplatform-deployment.settings
@@ -26,7 +26,7 @@

 "punchplatform_operator" : {
     "configuration_name_dir_from_home" : "pp-conf-livedemo",
-    "operators_username" : ["loic","qmarinie","leo","dimi","obarbier","punchplatform","rpenco","sgrah"],
+    "operators_username" : ["loic","qmarinie","leo","dimi","obarbier","punchplatform","rpenco","sgrah","cedric"],
     "servers" : {
         "server4" : {}
     }
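
Because every configuration change is a plain git commit, a faulty change can also be rolled back. For example, to revert the commit shown above (a sketch; pushing depends on your synchronization workflow):

git revert b65114642c877be96a20d89dba9669d5794d36d1
git push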

Apache Storm

Storm logs contain the submitting user and the name of the submitted topology.

example:

Here the mytenant_ufw_processing topology has been submitted by the user loic:

2018-10-18 10:16:50.093 o.a.s.d.nimbus [INFO] Setting new assignment for topology id mytenant_ufw_processing-10-1535456322: #org.apache.storm.daemon.common.Assignment{:master-code-dir "/data/storm", :node->host {"c026c4ff-23ce-4094-a229-284e036d0ec2" "server6"}, :executor->node+port {[4 4] ["c026c4ff-23ce-4094-a229-284e036d0ec2" 6707], [3 3] ["c026c4ff-23ce-4094-a229-284e036d0ec2" 6707], [2 2] ["c026c4ff-23ce-4094-a229-284e036d0ec2" 6707], [1 1] ["c026c4ff-23ce-4094-a229-284e036d0ec2" 6707]}, :executor->start-time-secs {[1 1] 1539850610, [2 2] 1539850610, [3 3] 1539850610, [4 4] 1539850610}, :worker->resources {["c026c4ff-23ce-4094-a229-284e036d0ec2" 6707] [0.0 0.0 0.0]}, :owner "loic"}
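
To find out who submitted a given topology, the nimbus log can simply be searched for the topology name. A sketch, assuming the nimbus log is located under /var/log/storm (the actual path depends on your deployment):

grep "mytenant_ufw_processing" /var/log/storm/nimbus.log | grep "owner"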