mirror of
https://github.com/openfaas/faas.git
synced 2025-06-11 09:46:48 +00:00
Alter graceful shutdown sequence
- the shutdown sequence meant that the kubelet was still passing work to the watchdog after the HTTP socket was closed. This change means that the kubelet has a chance to run its check before we finally stop accepting new connections. It will require some basic co-ordination between the kubelet's checking period and the "write_timeout" value in the container. Tested with Kubernetes on GKE - before the change some Pods were giving a connection refused error due to them being not detected as unhealthy. Now I receive 0% error rate even with 20 qps. Issue was shown by scaling to 20 replicas, starting a test with hey and then scaling to 1 replica while tailing the logs from the gateway. Before I saw some 502, now I see just 200s. Signed-off-by: Alex Ellis (VMware) <alexellis2@gmail.com>
This commit is contained in:
parent
d9f33435f0
commit
e67811c91c
@ -111,6 +111,16 @@ A new version of the watchdog is being tested over at [openfaas-incubator/of-wat
|
|||||||
|
|
||||||
This re-write is mainly structural for on-going maintenance. It will be a drop-in replacement for the existing watchdog and also has binary releases available.
|
This re-write is mainly structural for on-going maintenance. It will be a drop-in replacement for the existing watchdog and also has binary releases available.
|
||||||
|
|
||||||
|
### Graceful shutdowns
|
||||||
|
|
||||||
|
The watchdog is capable of working with health-checks to provide a graceful shutdown.
|
||||||
|
|
||||||
|
When a `SIGTERM` signal is detected within the watchdog process a Go routine will remove the `/tmp/.lock` file and mark the HTTP health-check as unhealthy and return HTTP 503. The code will then wait for the duration specified in `write_timeout`. During this window the container-orchestrator's health-check must run and complete.
|
||||||
|
|
||||||
|
Now the orchestrator will mark this replica as unhealthy and remove it from the pool of valid HTTP endpoints.
|
||||||
|
|
||||||
|
Now we will stop accepting new connections and wait for the value defined in `write_timeout` before finally allowing the process to exit.
|
||||||
|
|
||||||
### Working with HTTP headers
|
### Working with HTTP headers
|
||||||
|
|
||||||
Headers and other request information are injected into environmental variables in the following format:
|
Headers and other request information are injected into environmental variables in the following format:
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
#!/bin/sh
|
#!/bin/bash
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
export arch=$(uname -m)
|
export arch=$(uname -m)
|
||||||
|
|
||||||
if [ "$arch" = "armv7l" ] ; then
|
if [ "$arch" = "armv7l" ] ; then
|
||||||
|
@ -14,13 +14,12 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/openfaas/faas/watchdog/types"
|
"github.com/openfaas/faas/watchdog/types"
|
||||||
)
|
)
|
||||||
|
|
||||||
var acceptingConnections bool
|
|
||||||
|
|
||||||
// buildFunctionInput for a GET method this is an empty byte array.
|
// buildFunctionInput for a GET method this is an empty byte array.
|
||||||
func buildFunctionInput(config *WatchdogConfig, r *http.Request) ([]byte, error) {
|
func buildFunctionInput(config *WatchdogConfig, r *http.Request) ([]byte, error) {
|
||||||
var res []byte
|
var res []byte
|
||||||
@ -270,7 +269,8 @@ func createLockFile() (string, error) {
|
|||||||
path := filepath.Join(os.TempDir(), ".lock")
|
path := filepath.Join(os.TempDir(), ".lock")
|
||||||
log.Printf("Writing lock-file to: %s\n", path)
|
log.Printf("Writing lock-file to: %s\n", path)
|
||||||
writeErr := ioutil.WriteFile(path, []byte{}, 0660)
|
writeErr := ioutil.WriteFile(path, []byte{}, 0660)
|
||||||
acceptingConnections = true
|
|
||||||
|
atomic.StoreInt32(&acceptingConnections, 1)
|
||||||
|
|
||||||
return path, writeErr
|
return path, writeErr
|
||||||
}
|
}
|
||||||
@ -279,13 +279,14 @@ func makeHealthHandler() func(http.ResponseWriter, *http.Request) {
|
|||||||
return func(w http.ResponseWriter, r *http.Request) {
|
return func(w http.ResponseWriter, r *http.Request) {
|
||||||
switch r.Method {
|
switch r.Method {
|
||||||
case http.MethodGet:
|
case http.MethodGet:
|
||||||
if acceptingConnections == false || lockFilePresent() == false {
|
if atomic.LoadInt32(&acceptingConnections) == 0 || lockFilePresent() == false {
|
||||||
w.WriteHeader(http.StatusInternalServerError)
|
w.WriteHeader(http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
w.WriteHeader(http.StatusOK)
|
w.WriteHeader(http.StatusOK)
|
||||||
w.Write([]byte("OK"))
|
w.Write([]byte("OK"))
|
||||||
|
|
||||||
break
|
break
|
||||||
default:
|
default:
|
||||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||||
|
@ -11,25 +11,30 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
|
"path/filepath"
|
||||||
|
"sync/atomic"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/openfaas/faas/watchdog/types"
|
"github.com/openfaas/faas/watchdog/types"
|
||||||
)
|
)
|
||||||
|
|
||||||
var version bool
|
var (
|
||||||
|
versionFlag bool
|
||||||
|
acceptingConnections int32
|
||||||
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
flag.BoolVar(&version, "version", false, "Print the version and exit")
|
flag.BoolVar(&versionFlag, "version", false, "Print the version and exit")
|
||||||
|
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
printVersion()
|
printVersion()
|
||||||
|
|
||||||
if version == true {
|
if versionFlag {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
acceptingConnections = false
|
atomic.StoreInt32(&acceptingConnections, 0)
|
||||||
|
|
||||||
osEnv := types.OsEnv{}
|
osEnv := types.OsEnv{}
|
||||||
readConfig := ReadConfig{}
|
readConfig := ReadConfig{}
|
||||||
@ -50,24 +55,25 @@ func main() {
|
|||||||
MaxHeaderBytes: 1 << 20, // Max header of 1MB
|
MaxHeaderBytes: 1 << 20, // Max header of 1MB
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.Printf("Read/write timeout: %s, %s. Port: %d\n", readTimeout, writeTimeout, config.port)
|
||||||
http.HandleFunc("/_/health", makeHealthHandler())
|
http.HandleFunc("/_/health", makeHealthHandler())
|
||||||
http.HandleFunc("/", makeRequestHandler(&config))
|
http.HandleFunc("/", makeRequestHandler(&config))
|
||||||
|
|
||||||
if config.suppressLock == false {
|
shutdownTimeout := config.writeTimeout
|
||||||
path, writeErr := createLockFile()
|
|
||||||
|
|
||||||
if writeErr != nil {
|
listenUntilShutdown(shutdownTimeout, s, config.suppressLock)
|
||||||
log.Panicf("Cannot write %s. To disable lock-file set env suppress_lock=true.\n Error: %s.\n", path, writeErr.Error())
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
log.Println("Warning: \"suppress_lock\" is enabled. No automated health-checks will be in place for your function.")
|
|
||||||
acceptingConnections = true
|
|
||||||
}
|
|
||||||
|
|
||||||
listenUntilShutdown(config.writeTimeout, s)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func listenUntilShutdown(shutdownTimeout time.Duration, s *http.Server) {
|
func markUnhealthy() error {
|
||||||
|
atomic.StoreInt32(&acceptingConnections, 0)
|
||||||
|
|
||||||
|
path := filepath.Join(os.TempDir(), ".lock")
|
||||||
|
log.Printf("Removing lock-file : %s\n", path)
|
||||||
|
removeErr := os.Remove(path)
|
||||||
|
return removeErr
|
||||||
|
}
|
||||||
|
|
||||||
|
func listenUntilShutdown(shutdownTimeout time.Duration, s *http.Server, suppressLock bool) {
|
||||||
|
|
||||||
idleConnsClosed := make(chan struct{})
|
idleConnsClosed := make(chan struct{})
|
||||||
go func() {
|
go func() {
|
||||||
@ -76,23 +82,46 @@ func listenUntilShutdown(shutdownTimeout time.Duration, s *http.Server) {
|
|||||||
|
|
||||||
<-sig
|
<-sig
|
||||||
|
|
||||||
log.Printf("SIGTERM received.. shutting down server")
|
log.Printf("SIGTERM received.. shutting down server in %s\n", shutdownTimeout.String())
|
||||||
|
|
||||||
acceptingConnections = false
|
healthErr := markUnhealthy()
|
||||||
|
|
||||||
|
if healthErr != nil {
|
||||||
|
log.Printf("Unable to mark unhealthy during shutdown: %s\n", healthErr.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
<-time.Tick(shutdownTimeout)
|
||||||
|
|
||||||
if err := s.Shutdown(context.Background()); err != nil {
|
if err := s.Shutdown(context.Background()); err != nil {
|
||||||
// Error from closing listeners, or context timeout:
|
// Error from closing listeners, or context timeout:
|
||||||
log.Printf("Error in Shutdown: %v", err)
|
log.Printf("Error in Shutdown: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.Printf("No new connections allowed. Exiting in: %s\n", shutdownTimeout.String())
|
||||||
|
|
||||||
<-time.Tick(shutdownTimeout)
|
<-time.Tick(shutdownTimeout)
|
||||||
|
|
||||||
close(idleConnsClosed)
|
close(idleConnsClosed)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
if err := s.ListenAndServe(); err != http.ErrServerClosed {
|
// Run the HTTP server in a separate go-routine.
|
||||||
log.Printf("Error ListenAndServe: %v", err)
|
go func() {
|
||||||
close(idleConnsClosed)
|
if err := s.ListenAndServe(); err != http.ErrServerClosed {
|
||||||
|
log.Printf("Error ListenAndServe: %v", err)
|
||||||
|
close(idleConnsClosed)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if suppressLock == false {
|
||||||
|
path, writeErr := createLockFile()
|
||||||
|
|
||||||
|
if writeErr != nil {
|
||||||
|
log.Panicf("Cannot write %s. To disable lock-file set env suppress_lock=true.\n Error: %s.\n", path, writeErr.Error())
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.Println("Warning: \"suppress_lock\" is enabled. No automated health-checks will be in place for your function.")
|
||||||
|
|
||||||
|
atomic.StoreInt32(&acceptingConnections, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
<-idleConnsClosed
|
<-idleConnsClosed
|
||||||
|
@ -463,7 +463,7 @@ func TestHealthHandler_StatusInternalServerError_LockFileNotPresent(t *testing.T
|
|||||||
handler := makeHealthHandler()
|
handler := makeHealthHandler()
|
||||||
handler(rr, req)
|
handler(rr, req)
|
||||||
|
|
||||||
required := http.StatusInternalServerError
|
required := http.StatusServiceUnavailable
|
||||||
if status := rr.Code; status != required {
|
if status := rr.Code; status != required {
|
||||||
t.Errorf("handler returned wrong status code - got: %v, want: %v", status, required)
|
t.Errorf("handler returned wrong status code - got: %v, want: %v", status, required)
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user