From 9a32299fb160a4b33731071f0f5805be6a34f9ae Mon Sep 17 00:00:00 2001
From: Knut Ahlers
Date: Fri, 3 Jun 2016 12:55:19 +0200
Subject: [PATCH] Initial version

---
 .gitignore              |   1 +
 elb-instance-status.yml |  16 ++++
 main.go                 | 205 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 222 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 elb-instance-status.yml
 create mode 100644 main.go

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5842232
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+elb-instance-status
diff --git a/elb-instance-status.yml b/elb-instance-status.yml
new file mode 100644
index 0000000..a6c18b4
--- /dev/null
+++ b/elb-instance-status.yml
@@ -0,0 +1,16 @@
+---
+
+- name: Ensure there are at least 30% free inodes on /
+  command: test $(df -i | grep "/$" | xargs | cut -d ' ' -f 5 | sed "s/%//") -lt 70
+
+- name: Ensure there are at least 30% free inodes on /var/lib/docker
+  command: test $(df -i | grep "/var/lib/docker$" | xargs | cut -d ' ' -f 5 | sed "s/%//") -lt 70
+
+- name: Ensure there is at least 30% free disk space on /var/lib/docker
+  command: test $(df | grep "/var/lib/docker$" | xargs | cut -d ' ' -f 5 | sed "s/%//") -lt 70
+
+- name: Ensure volume on /var/lib/docker is mounted
+  command: mount | grep -q /var/lib/docker
+
+- name: Ensure docker can start a small container
+  command: docker run --rm alpine /bin/sh -c "echo testing123" | grep -q testing123
\ No newline at end of file
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..3810e24
--- /dev/null
+++ b/main.go
@@ -0,0 +1,205 @@
+// Command elb-instance-status periodically executes shell based health
+// checks and reports their aggregated state on an HTTP endpoint which can be
+// used as the target of an ELB health check.
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"os"
+	"os/exec"
+	"strconv"
+	"sync"
+	"time"
+
+	"gopkg.in/yaml.v2"
+
+	"github.com/Luzifer/rconfig"
+	"github.com/gorilla/mux"
+	"github.com/robfig/cron"
+)
+
+// checkTimeout limits the runtime of a single check command. It equals the
+// scheduling interval used in main, so a hanging command is registered as a
+// failure instead of keeping its last (possibly passing) result forever.
+const checkTimeout = time.Minute
+
+var (
+	cfg = struct {
+		CheckDefinitionsFile string `flag:"check-definitions-file,c" default:"/etc/elb-instance-status.yml" description:"File containing checks to perform for instance health"`
+		UnhealthyThreshold   int64  `flag:"unhealthy-threshold" default:"5" description:"How often does a check have to fail to mark the machine unhealthy"`
+		Listen               string `flag:"listen" default:":3000" description:"IP/Port to listen on for ELB health checks"`
+		VersionAndExit       bool   `flag:"version" default:"false" description:"Print version and exit"`
+	}{}
+
+	version = "dev"
+
+	// checks holds the check definitions read by loadChecks.
+	checks = []checkCommand{}
+
+	// checkResultsLock guards checkResults and lastResultRegistered.
+	checkResults         = map[string]*checkResult{}
+	checkResultsLock     sync.RWMutex
+	lastResultRegistered time.Time
+)
+
+// checkCommand is one entry of the check definitions file.
+type checkCommand struct {
+	Name     string `yaml:"name"`
+	Command  string `yaml:"command"`
+	WarnOnly bool   `yaml:"warn-only"`
+}
+
+// checkResult stores the latest outcome of a check together with the number
+// of consecutive executions which ended with that same outcome.
+type checkResult struct {
+	Check     checkCommand
+	IsSuccess bool
+	Streak    int64
+}
+
+// init parses the command line flags into cfg and handles the version flag.
+func init() {
+	if err := rconfig.Parse(&cfg); err != nil {
+		log.Fatalf("Unable to parse commandline options: %s", err)
+	}
+
+	if cfg.VersionAndExit {
+		fmt.Printf("elb-instance-status %s\n", version)
+		os.Exit(0)
+	}
+}
+
+// loadChecks reads the YAML check definitions file named in cfg into checks.
+func loadChecks() error {
+	rawChecks, err := ioutil.ReadFile(cfg.CheckDefinitionsFile)
+	if err != nil {
+		return err
+	}
+	return yaml.Unmarshal(rawChecks, &checks)
+}
+
+// main loads the checks, schedules them to run every minute, triggers a first
+// run immediately and then serves the health state on /status.
+func main() {
+	if err := loadChecks(); err != nil {
+		log.Fatalf("Unable to read definitions file: %s", err)
+	}
+
+	c := cron.New()
+	if err := c.AddFunc("@every 1m", spawnChecks); err != nil {
+		log.Fatalf("Unable to schedule checks: %s", err)
+	}
+	c.Start()
+
+	spawnChecks()
+
+	r := mux.NewRouter()
+	r.HandleFunc("/status", handleELBHealthCheck)
+	// ListenAndServe only returns on failure (e.g. the address is in use).
+	log.Fatalf("Unable to serve health checks: %s", http.ListenAndServe(cfg.Listen, r))
+}
+
+// spawnChecks starts one goroutine per configured check.
+func spawnChecks() {
+	for i := range checks {
+		go executeAndRegisterCheck(i)
+	}
+}
+
+// runCheckCommand executes command through /bin/bash and returns nil only if
+// it exits with status zero within checkTimeout.
+func runCheckCommand(command string) error {
+	cmd := exec.Command("/bin/bash", "-c", command)
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+
+	// Buffered so the waiting goroutine can always deliver its result.
+	done := make(chan error, 1)
+	go func() { done <- cmd.Wait() }()
+
+	select {
+	case err := <-done:
+		return err
+	case <-time.After(checkTimeout):
+		// Kill only fails if the process is already gone, in which case the
+		// receive below returns immediately as well.
+		_ = cmd.Process.Kill()
+		<-done
+		return fmt.Errorf("check did not finish within %s", checkTimeout)
+	}
+}
+
+// executeAndRegisterCheck runs the check at checkIndex and records whether it
+// succeeded, extending the streak if the outcome equals the previous one and
+// restarting it at 1 otherwise.
+func executeAndRegisterCheck(checkIndex int) {
+	check := checks[checkIndex]
+	success := runCheckCommand(check.Command) == nil
+
+	checkResultsLock.Lock()
+	defer checkResultsLock.Unlock()
+
+	result, ok := checkResults[check.Name]
+	if !ok {
+		result = &checkResult{Check: check}
+		checkResults[check.Name] = result
+	}
+
+	if success == result.IsSuccess {
+		result.Streak++
+	} else {
+		result.IsSuccess = success
+		result.Streak = 1
+	}
+
+	lastResultRegistered = time.Now()
+}
+
+// handleELBHealthCheck writes one "[STATE] name" line per known check result
+// and responds with HTTP 500 as soon as a check which is not warn-only has
+// failed at least cfg.UnhealthyThreshold times in a row, HTTP 200 otherwise.
+func handleELBHealthCheck(res http.ResponseWriter, r *http.Request) {
+	healthy := true
+	start := time.Now()
+	buf := new(bytes.Buffer)
+
+	checkResultsLock.RLock()
+	for cn, cr := range checkResults {
+		var state string
+		switch {
+		case cr.IsSuccess:
+			state = "PASS"
+		case cr.Check.WarnOnly:
+			state = "WARN"
+		default:
+			state = "CRIT"
+			if cr.Streak >= cfg.UnhealthyThreshold {
+				healthy = false
+			}
+		}
+		fmt.Fprintf(buf, "[%s] %s\n", state, cn)
+	}
+	// Copy the timestamp while holding the lock; the check goroutines write it.
+	lastRegistered := lastResultRegistered
+	checkResultsLock.RUnlock()
+
+	// The header value carries an "ms" suffix, so report real milliseconds.
+	elapsed := float64(time.Since(start)) / float64(time.Millisecond)
+	res.Header().Set("X-Collection-Parsed-In", strconv.FormatFloat(elapsed, 'f', 3, 64)+"ms")
+	res.Header().Set("X-Last-Result-Registered-At", lastRegistered.Format(time.RFC1123))
+	if healthy {
+		res.WriteHeader(http.StatusOK)
+	} else {
+		res.WriteHeader(http.StatusInternalServerError)
+	}
+
+	if _, err := io.Copy(res, buf); err != nil {
+		log.Printf("Unable to write health check response: %s", err)
+	}
+}