2016-06-03 10:55:19 +00:00
package main
import (
"bytes"
2016-07-22 09:22:29 +00:00
"errors"
2016-06-03 10:55:19 +00:00
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
2016-07-22 09:22:29 +00:00
"net/url"
2016-06-03 10:55:19 +00:00
"os"
"os/exec"
"strconv"
"sync"
"time"
"github.com/Luzifer/rconfig"
"github.com/gorilla/mux"
2016-06-06 14:33:45 +00:00
"github.com/prometheus/client_golang/prometheus"
2016-06-03 10:55:19 +00:00
"github.com/robfig/cron"
2016-07-22 09:35:24 +00:00
"golang.org/x/net/context"
"gopkg.in/yaml.v2"
2016-06-03 10:55:19 +00:00
)
var (
cfg = struct {
2016-07-22 09:22:29 +00:00
CheckDefinitionsFile string ` flag:"check-definitions-file,c" default:"/etc/elb-instance-status.yml" description:"File or URL containing checks to perform for instance health" `
2016-06-03 10:55:19 +00:00
UnhealthyThreshold int64 ` flag:"unhealthy-threshold" default:"5" description:"How often does a check have to fail to mark the machine unhealthy" `
2016-07-22 11:18:10 +00:00
CheckInterval time . Duration ` flag:"check-interval" default:"1m" description:"How often to execute checks (do not set below 10s!)" `
ConfigRefreshInterval time . Duration ` flag:"config-refresh" default:"10m" description:"How often to update checks from definitions file / url" `
Listen string ` flag:"listen" default:":3000" description:"IP/Port to listen on for ELB health checks" `
VersionAndExit bool ` flag:"version" default:"false" description:"Print version and exit" `
2016-06-03 10:55:19 +00:00
} { }
version = "dev"
2016-07-22 09:22:29 +00:00
checks map [ string ] checkCommand
2016-06-03 10:55:19 +00:00
checkResults = map [ string ] * checkResult { }
checkResultsLock sync . RWMutex
lastResultRegistered time . Time
)
// checkCommand is a single check definition as decoded from the YAML
// definitions file.
type checkCommand struct {
	// Name is the label printed for this check in the /status report.
	Name string ` yaml:"name" `
	// Command is the shell snippet executed through "/bin/bash -c".
	Command string ` yaml:"command" `
	// WarnOnly marks a check whose failure is reported as WARN and never
	// turns the instance unhealthy.
	WarnOnly bool ` yaml:"warn-only" `
}
// checkResult is the most recent outcome recorded for one check.
type checkResult struct {
	// Check is the definition the result belongs to.
	Check checkCommand
	// IsSuccess reports whether the last execution finished without error.
	IsSuccess bool
	// Streak counts how many consecutive executions ended with the current
	// IsSuccess value; it restarts at 1 whenever the outcome flips.
	Streak int64
}
// init parses the command line into cfg and handles the version flag.
//
// A parse failure is fatal: continuing with a partially populated cfg would
// silently run the daemon with wrong intervals or a wrong listen address.
func init() {
	if err := rconfig.Parse(&cfg); err != nil {
		log.Fatalf("Unable to parse commandline options: %s", err)
	}

	if cfg.VersionAndExit {
		fmt.Printf("elb-instance-status %s\n", version)
		os.Exit(0)
	}
}
func loadChecks ( ) error {
2016-07-22 12:45:18 +00:00
var rawChecks [ ] byte
2016-07-22 09:22:29 +00:00
if _ , err := os . Stat ( cfg . CheckDefinitionsFile ) ; err == nil {
// We got a local file, read it
rawChecks , err = ioutil . ReadFile ( cfg . CheckDefinitionsFile )
if err != nil {
return err
}
} else {
// Check whether we got an URL
if _ , err := url . Parse ( cfg . CheckDefinitionsFile ) ; err != nil {
return errors . New ( "Definitions file is neither a local file nor a URL" )
}
// We got an URL, fetch and read it
resp , err := http . Get ( cfg . CheckDefinitionsFile )
if err != nil {
return err
}
defer resp . Body . Close ( )
rawChecks , err = ioutil . ReadAll ( resp . Body )
if err != nil {
return err
}
}
tmpResult := map [ string ] checkCommand { }
2016-07-22 12:45:18 +00:00
err := yaml . Unmarshal ( rawChecks , & tmpResult )
2016-07-22 09:22:29 +00:00
if err == nil {
checks = tmpResult
2016-06-03 10:55:19 +00:00
}
2016-07-22 09:22:29 +00:00
return err
2016-06-03 10:55:19 +00:00
}
func main ( ) {
if err := loadChecks ( ) ; err != nil {
log . Fatalf ( "Unable to read definitions file: %s" , err )
}
c := cron . New ( )
2016-07-22 11:18:10 +00:00
c . AddFunc ( "@every " + cfg . CheckInterval . String ( ) , spawnChecks )
c . AddFunc ( "@every " + cfg . ConfigRefreshInterval . String ( ) , func ( ) {
2016-07-22 09:22:29 +00:00
if err := loadChecks ( ) ; err != nil {
log . Printf ( "Unable to refresh checks: %s" , err )
}
} )
2016-06-03 10:55:19 +00:00
c . Start ( )
spawnChecks ( )
r := mux . NewRouter ( )
r . HandleFunc ( "/status" , handleELBHealthCheck )
2016-06-06 14:33:45 +00:00
r . Handle ( "/metrics" , prometheus . Handler ( ) )
2016-06-03 10:55:19 +00:00
http . ListenAndServe ( cfg . Listen , r )
}
func spawnChecks ( ) {
2016-07-22 11:18:10 +00:00
ctx , _ := context . WithTimeout ( context . Background ( ) , cfg . CheckInterval - time . Second )
2016-07-22 09:35:24 +00:00
2016-06-06 14:33:45 +00:00
for id := range checks {
2016-07-22 09:35:24 +00:00
go executeAndRegisterCheck ( ctx , id )
2016-06-03 10:55:19 +00:00
}
}
2016-07-22 09:35:24 +00:00
func executeAndRegisterCheck ( ctx context . Context , checkID string ) {
2016-06-06 14:33:45 +00:00
check := checks [ checkID ]
start := time . Now ( )
2016-06-03 10:55:19 +00:00
cmd := exec . Command ( "/bin/bash" , "-c" , check . Command )
2016-07-22 09:35:24 +00:00
err := cmd . Start ( )
if err == nil {
cmdDone := make ( chan error )
go func ( cmdDone chan error , cmd * exec . Cmd ) { cmdDone <- cmd . Wait ( ) } ( cmdDone , cmd )
loop := true
for loop {
select {
case err = <- cmdDone :
loop = false
case <- ctx . Done ( ) :
log . Printf ( "Execution of check '%s' was killed through context timeout." , checkID )
cmd . Process . Kill ( )
time . Sleep ( time . Millisecond )
}
}
}
2016-06-03 10:55:19 +00:00
success := err == nil
checkResultsLock . Lock ( )
2016-06-06 14:33:45 +00:00
if _ , ok := checkResults [ checkID ] ; ! ok {
checkResults [ checkID ] = & checkResult {
2016-06-03 10:55:19 +00:00
Check : check ,
}
}
2016-06-06 14:33:45 +00:00
if success == checkResults [ checkID ] . IsSuccess {
checkResults [ checkID ] . Streak ++
2016-06-03 10:55:19 +00:00
} else {
2016-06-06 14:33:45 +00:00
checkResults [ checkID ] . IsSuccess = success
checkResults [ checkID ] . Streak = 1
2016-06-03 10:55:19 +00:00
}
lastResultRegistered = time . Now ( )
2016-06-06 14:33:45 +00:00
if success {
checkPassing . WithLabelValues ( checkID ) . Set ( 1 )
} else {
checkPassing . WithLabelValues ( checkID ) . Set ( 0 )
}
checkExecutionTime . WithLabelValues ( checkID ) . Observe ( float64 ( time . Since ( start ) . Nanoseconds ( ) ) / float64 ( time . Microsecond ) )
2016-06-03 10:55:19 +00:00
checkResultsLock . Unlock ( )
}
func handleELBHealthCheck ( res http . ResponseWriter , r * http . Request ) {
healthy := true
start := time . Now ( )
buf := bytes . NewBuffer ( [ ] byte { } )
checkResultsLock . RLock ( )
2016-06-06 14:33:45 +00:00
for _ , cr := range checkResults {
2016-06-03 10:55:19 +00:00
state := ""
switch {
case cr . IsSuccess :
state = "PASS"
case ! cr . IsSuccess && cr . Check . WarnOnly :
state = "WARN"
case ! cr . IsSuccess && ! cr . Check . WarnOnly && cr . Streak < cfg . UnhealthyThreshold :
state = "CRIT"
case ! cr . IsSuccess && ! cr . Check . WarnOnly && cr . Streak >= cfg . UnhealthyThreshold :
state = "CRIT"
healthy = false
}
2016-06-06 14:33:45 +00:00
fmt . Fprintf ( buf , "[%s] %s\n" , state , cr . Check . Name )
2016-06-03 10:55:19 +00:00
}
checkResultsLock . RUnlock ( )
res . Header ( ) . Set ( "X-Collection-Parsed-In" , strconv . FormatInt ( time . Since ( start ) . Nanoseconds ( ) / int64 ( time . Microsecond ) , 10 ) + "ms" )
res . Header ( ) . Set ( "X-Last-Result-Registered-At" , lastResultRegistered . Format ( time . RFC1123 ) )
if healthy {
2016-06-06 14:33:45 +00:00
currentStatusCode . Set ( http . StatusOK )
2016-06-03 10:55:19 +00:00
res . WriteHeader ( http . StatusOK )
} else {
2016-06-06 14:33:45 +00:00
currentStatusCode . Set ( http . StatusInternalServerError )
2016-06-03 10:55:19 +00:00
res . WriteHeader ( http . StatusInternalServerError )
}
io . Copy ( res , buf )
}