From eaa816da4ee56d91b4feeec4db5028c94dd4cb06 Mon Sep 17 00:00:00 2001 From: Knut Ahlers Date: Mon, 6 Jun 2016 16:33:45 +0200 Subject: [PATCH] Expose metrics about checks for prometheus --- elb-instance-status.yml | 25 ++++++++++++--------- main.go | 38 ++++++++++++++++++++----------- metrics.go | 50 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 23 deletions(-) create mode 100644 metrics.go diff --git a/elb-instance-status.yml b/elb-instance-status.yml index a6c18b4..6c10376 100644 --- a/elb-instance-status.yml +++ b/elb-instance-status.yml @@ -1,16 +1,21 @@ --- -- name: Ensure there are at least 30% free inodes on / +root_free_inodes: + name: Ensure there are at least 30% free inodes on / command: test $(df -i | grep "/$" | xargs | cut -d ' ' -f 5 | sed "s/%//") -lt 70 - -- name: Ensure there are at least 30% free inodes on /var/lib/docker + +docker_free_inodes: + name: Ensure there are at least 30% free inodes on /var/lib/docker command: test $(df -i | grep "/var/lib/docker$" | xargs | cut -d ' ' -f 5 | sed "s/%//") -lt 70 - -- name: Ensure there is at least 30% free disk space on /var/lib/docker + +docker_free_diskspace: + name: Ensure there is at least 30% free disk space on /var/lib/docker command: test $(df | grep "/var/lib/docker$" | xargs | cut -d ' ' -f 5 | sed "s/%//") -lt 70 - -- name: Ensure volume on /var/lib/docker is mounted + +docker_mounted: + name: Ensure volume on /var/lib/docker is mounted command: mount | grep -q /var/lib/docker - -- name: Ensure docker can start a small container - command: docker run --rm alpine /bin/sh -c "echo testing123" | grep -q testing123 \ No newline at end of file + +docker_start_container: + name: Ensure docker can start a small container + command: docker run --rm alpine /bin/sh -c "echo testing123" | grep -q testing123 diff --git a/main.go b/main.go index 3810e24..4c105d4 100644 --- a/main.go +++ b/main.go @@ -17,6 +17,7 @@ import ( "github.com/Luzifer/rconfig" "github.com/gorilla/mux" + "github.com/prometheus/client_golang/prometheus" "github.com/robfig/cron" ) @@ -30,7 +31,7 @@ var ( version = "dev" - checks = []checkCommand{} + checks = map[string]checkCommand{} checkResults = map[string]*checkResult{} checkResultsLock sync.RWMutex lastResultRegistered time.Time @@ -78,17 +79,19 @@ func main() { r := mux.NewRouter() r.HandleFunc("/status", handleELBHealthCheck) + r.Handle("/metrics", prometheus.Handler()) http.ListenAndServe(cfg.Listen, r) } func spawnChecks() { - for i := range checks { - go executeAndRegisterCheck(i) + for id := range checks { + go executeAndRegisterCheck(id) } } -func executeAndRegisterCheck(checkIndex int) { - check := checks[checkIndex] +func executeAndRegisterCheck(checkID string) { + check := checks[checkID] + start := time.Now() cmd := exec.Command("/bin/bash", "-c", check.Command) err := cmd.Run() @@ -97,21 +100,28 @@ func executeAndRegisterCheck(checkIndex int) { checkResultsLock.Lock() - if _, ok := checkResults[check.Name]; !ok { - checkResults[check.Name] = &checkResult{ + if _, ok := checkResults[checkID]; !ok { + checkResults[checkID] = &checkResult{ Check: check, } } - if success == checkResults[check.Name].IsSuccess { - checkResults[check.Name].Streak++ + if success == checkResults[checkID].IsSuccess { + checkResults[checkID].Streak++ } else { - checkResults[check.Name].IsSuccess = success - checkResults[check.Name].Streak = 1 + checkResults[checkID].IsSuccess = success + checkResults[checkID].Streak = 1 } lastResultRegistered = time.Now() + if success { + checkPassing.WithLabelValues(checkID).Set(1) + } else { + checkPassing.WithLabelValues(checkID).Set(0) + } + checkExecutionTime.WithLabelValues(checkID).Observe(float64(time.Since(start).Nanoseconds()) / float64(time.Microsecond)) + checkResultsLock.Unlock() } @@ -121,7 +131,7 @@ func handleELBHealthCheck(res http.ResponseWriter, r *http.Request) { buf := bytes.NewBuffer([]byte{}) checkResultsLock.RLock() - for cn, cr := range checkResults { + for _, cr := range checkResults { state := "" switch { case cr.IsSuccess: @@ -134,15 +144,17 @@ func handleELBHealthCheck(res http.ResponseWriter, r *http.Request) { state = "CRIT" healthy = false } - fmt.Fprintf(buf, "[%s] %s\n", state, cn) + fmt.Fprintf(buf, "[%s] %s\n", state, cr.Check.Name) } checkResultsLock.RUnlock() res.Header().Set("X-Collection-Parsed-In", strconv.FormatInt(time.Since(start).Nanoseconds()/int64(time.Microsecond), 10)+"ms") res.Header().Set("X-Last-Result-Registered-At", lastResultRegistered.Format(time.RFC1123)) if healthy { + currentStatusCode.Set(http.StatusOK) res.WriteHeader(http.StatusOK) } else { + currentStatusCode.Set(http.StatusInternalServerError) res.WriteHeader(http.StatusInternalServerError) } diff --git a/metrics.go b/metrics.go new file mode 100644 index 0000000..7059664 --- /dev/null +++ b/metrics.go @@ -0,0 +1,50 @@ +package main + +import ( + "log" + "os" + + "github.com/prometheus/client_golang/prometheus" +) + +var ( + checkPassing *prometheus.GaugeVec + checkExecutionTime *prometheus.SummaryVec + currentStatusCode prometheus.Gauge + + dynamicLabels = []string{"check_id"} +) + +func init() { + hostname, err := os.Hostname() + if err != nil { + log.Fatalf("Unable to determine own hostname: %s", err) + } + + co := prometheus.GaugeOpts{ + Subsystem: "elb_instance_status", + ConstLabels: prometheus.Labels{"hostname": hostname}, + } + + co.Name = "check_passing" + co.Help = "Bit showing whether the check PASSed (=1) or FAILed (=0), WARNs are also reported as FAILs" + + cp := prometheus.NewGaugeVec(co, dynamicLabels) + + co.Name = "status_code" + co.Help = "Contains the current HTTP status code the ELB is seeing" + + csc := prometheus.NewGauge(co) + + cet := prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Namespace: co.Namespace, + Subsystem: co.Subsystem, + ConstLabels: co.ConstLabels, + Name: "check_execution_time", + Help: "Timespan in µs the execution of the check took", + }, dynamicLabels) + + checkPassing = prometheus.MustRegisterOrGet(cp).(*prometheus.GaugeVec) + currentStatusCode = prometheus.MustRegisterOrGet(csc).(prometheus.Gauge) + checkExecutionTime = prometheus.MustRegisterOrGet(cet).(*prometheus.SummaryVec) +}