Alter graceful shutdown sequence

- The previous shutdown sequence meant that the kubelet was still
passing work to the watchdog after the HTTP socket was closed. This
change gives the kubelet a chance to run its check before we
finally stop accepting new connections. It requires some basic
co-ordination between the kubelet's checking period and the
"write_timeout" value in the container.

Tested with Kubernetes on GKE - before the change some Pods were
giving a connection refused error because they were not detected
as unhealthy. Now I see a 0% error rate even at 20 qps.

The issue was shown by scaling to 20 replicas, starting a test with
hey and then scaling down to 1 replica while tailing the logs from
the gateway. Before the change I saw some 502s; now I see only 200s.

Signed-off-by: Alex Ellis (VMware) <alexellis2@gmail.com>
This commit is contained in:
Alex Ellis (VMware)
2018-09-17 11:35:57 +01:00
committed by Alex Ellis
parent d9f33435f0
commit e67811c91c
5 changed files with 70 additions and 28 deletions

View File

@ -11,25 +11,30 @@ import (
"net/http"
"os"
"os/signal"
"path/filepath"
"sync/atomic"
"syscall"
"time"
"github.com/openfaas/faas/watchdog/types"
)
var version bool
var (
versionFlag bool
acceptingConnections int32
)
func main() {
flag.BoolVar(&version, "version", false, "Print the version and exit")
flag.BoolVar(&versionFlag, "version", false, "Print the version and exit")
flag.Parse()
printVersion()
if version == true {
if versionFlag {
return
}
acceptingConnections = false
atomic.StoreInt32(&acceptingConnections, 0)
osEnv := types.OsEnv{}
readConfig := ReadConfig{}
@ -50,24 +55,25 @@ func main() {
MaxHeaderBytes: 1 << 20, // Max header of 1MB
}
log.Printf("Read/write timeout: %s, %s. Port: %d\n", readTimeout, writeTimeout, config.port)
http.HandleFunc("/_/health", makeHealthHandler())
http.HandleFunc("/", makeRequestHandler(&config))
if config.suppressLock == false {
path, writeErr := createLockFile()
shutdownTimeout := config.writeTimeout
if writeErr != nil {
log.Panicf("Cannot write %s. To disable lock-file set env suppress_lock=true.\n Error: %s.\n", path, writeErr.Error())
}
} else {
log.Println("Warning: \"suppress_lock\" is enabled. No automated health-checks will be in place for your function.")
acceptingConnections = true
}
listenUntilShutdown(config.writeTimeout, s)
listenUntilShutdown(shutdownTimeout, s, config.suppressLock)
}
func listenUntilShutdown(shutdownTimeout time.Duration, s *http.Server) {
// markUnhealthy flips the acceptingConnections flag to 0 and deletes the
// ".lock" file from the OS temporary directory.
//
// It returns whatever error os.Remove reports (nil on success), so the
// caller decides how to react when the lock-file could not be removed.
// NOTE(review): the lock-file is presumably what the external health check
// looks for — confirm against the health handler.
func markUnhealthy() error {
	// Clear the flag first so it is already 0 before the lock-file is removed.
	atomic.StoreInt32(&acceptingConnections, 0)

	lockPath := filepath.Join(os.TempDir(), ".lock")
	log.Printf("Removing lock-file : %s\n", lockPath)

	return os.Remove(lockPath)
}
func listenUntilShutdown(shutdownTimeout time.Duration, s *http.Server, suppressLock bool) {
idleConnsClosed := make(chan struct{})
go func() {
@ -76,23 +82,46 @@ func listenUntilShutdown(shutdownTimeout time.Duration, s *http.Server) {
<-sig
log.Printf("SIGTERM received.. shutting down server")
log.Printf("SIGTERM received.. shutting down server in %s\n", shutdownTimeout.String())
acceptingConnections = false
healthErr := markUnhealthy()
if healthErr != nil {
log.Printf("Unable to mark unhealthy during shutdown: %s\n", healthErr.Error())
}
<-time.Tick(shutdownTimeout)
if err := s.Shutdown(context.Background()); err != nil {
// Error from closing listeners, or context timeout:
log.Printf("Error in Shutdown: %v", err)
}
log.Printf("No new connections allowed. Exiting in: %s\n", shutdownTimeout.String())
<-time.Tick(shutdownTimeout)
close(idleConnsClosed)
}()
if err := s.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Error ListenAndServe: %v", err)
close(idleConnsClosed)
// Run the HTTP server in a separate go-routine.
go func() {
if err := s.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Error ListenAndServe: %v", err)
close(idleConnsClosed)
}
}()
if suppressLock == false {
path, writeErr := createLockFile()
if writeErr != nil {
log.Panicf("Cannot write %s. To disable lock-file set env suppress_lock=true.\n Error: %s.\n", path, writeErr.Error())
}
} else {
log.Println("Warning: \"suppress_lock\" is enabled. No automated health-checks will be in place for your function.")
atomic.StoreInt32(&acceptingConnections, 1)
}
<-idleConnsClosed