Introduce gateway_function_invocation_total to track individual functions

Introduce prometheus_alertmanager into stack - have it fire into webhook stash
2025-06-16 12:16:47 +00:00 · 2017-01-21 10:11:33 +00:00 · 2017-01-21 10:11:33 +00:00 · ab2f8e85f3
commit ab2f8e85f3
parent a9e72cd0a3
8 changed files with 118 additions and 8 deletions
--- a/deploy_stack.sh
+++ b/deploy_stack.sh
@ -2,4 +2,3 @@

 echo "Deploying stack"
 docker stack rm func ;  docker stack deploy func --compose-file docker-compose.yml 
-
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -13,16 +13,32 @@ services:
        image: quay.io/prometheus/prometheus:latest
        volumes:
            - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
-        command: "-config.file=/etc/prometheus/prometheus.yml -storage.local.path=/prometheus -storage.local.memory-chunks=10000"
+            - ./prometheus/alert.rules:/etc/prometheus/alert.rules
+
+        command: "-config.file=/etc/prometheus/prometheus.yml -storage.local.path=/prometheus -storage.local.memory-chunks=10000 --alertmanager.url=http://alertmanager:9093"
        ports:
            - 9090:9090
        depends_on:
            - gateway
+            - alertmanager
        environment:
            no_proxy:   "gateway"
        networks:
            - functions

+    alertmanager:
+        image: quay.io/prometheus/alertmanager
+        environment:
+            no_proxy:   "gateway"
+        volumes:
+            - ./prometheus/alertmanager.yml:/alertmanager.yml
+        command:
+            - '-config.file=/alertmanager.yml'
+        networks:
+            - functions
+        ports:
+            - 9093:9093
+
    # Sample functions go here.
    webhookstash:
        image: alexellis2/faas-webhookstash:latest
--- a/gateway/build.sh
+++ b/gateway/build.sh
@ -10,4 +10,4 @@ docker rm -f gateway_extract

 echo Building alexellis2/faas-gateway:latest

-docker build -t alexellis2/faas-gateway:latest .
+docker build -t alexellis2/faas-gateway:latest-dev .
--- a/gateway/metrics/metrics.go
+++ b/gateway/metrics/metrics.go
@ -11,6 +11,7 @@ type MetricOptions struct {
 	GatewayRequestsTotal         prometheus.Counter
 	GatewayServerlessServedTotal prometheus.Counter
 	GatewayFunctions             prometheus.Histogram
+	GatewayFunctionInvocation    *prometheus.CounterVec
 }

 // PrometheusHandler Bootstraps prometheus for metrics collection
--- a/gateway/server.go
+++ b/gateway/server.go
@ -72,6 +72,8 @@ func isAlexa(requestBody []byte) AlexaRequestBody {
 }

 func invokeService(w http.ResponseWriter, r *http.Request, metrics metrics.MetricOptions, service string, requestBody []byte) {
+	metrics.GatewayFunctionInvocation.WithLabelValues(service).Add(1)
+
 	stamp := strconv.FormatInt(time.Now().Unix(), 10)

 	start := time.Now()
@ -171,15 +173,24 @@ func main() {
 		Name: "gateway_functions",
 		Help: "Gateway functions",
 	})
+	GatewayFunctionInvocation := prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "gateway_function_invocation_total",
+			Help: "Individual function metrics",
+		},
+		[]string{"function_name"},
+	)

 	prometheus.Register(GatewayRequestsTotal)
 	prometheus.Register(GatewayServerlessServedTotal)
 	prometheus.Register(GatewayFunctions)
+	prometheus.Register(GatewayFunctionInvocation)

 	metricsOptions := metrics.MetricOptions{
 		GatewayRequestsTotal:         GatewayRequestsTotal,
 		GatewayServerlessServedTotal: GatewayServerlessServedTotal,
 		GatewayFunctions:             GatewayFunctions,
+		GatewayFunctionInvocation:    GatewayFunctionInvocation,
 	}

 	r := mux.NewRouter()
--- a/prometheus/alert.rules
+++ b/prometheus/alert.rules
@ -0,0 +1,15 @@
+ALERT service_down
+  IF up == 0
+
+ALERT APIHighInvocationRate
+  IF rate ( gateway_function_invocation_total [10s] ) > 5
+  FOR 30s
+  ANNOTATIONS {
+    summary = "High invocation total on {{ $labels.instance }}",
+    description =  "High invocation total on {{ $labels.instance }}",
+  }
+  LABELS {
+    service = "gateway",
+    severity = "major",
+    value = "{{$value}}",
+  }
--- a/prometheus/alertmanager.yml
+++ b/prometheus/alertmanager.yml
@ -0,0 +1,68 @@
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'localhost:25'
+  smtp_from: 'alertmanager@example.org'
+  smtp_auth_username: 'alertmanager'
+  smtp_auth_password: 'password'
+  # The auth token for Hipchat.
+  hipchat_auth_token: '1234556789'
+  # Alternative host for Hipchat.
+  hipchat_url: 'https://hipchat.foobar.org/'
+
+# The directory from which notification templates are read.
+templates: 
+- '/etc/alertmanager/template/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This ensures that multiple alerts for the same group that start firing
+  # shortly after one another are batched together in the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If a notification has been sent successfully, wait 'repeat_interval'
+  # before resending it.
+  repeat_interval: 3h 
+
+  # A default receiver
+  receiver: scale-up
+
+  # All the above attributes are inherited by all child routes and can be
+  # overwritten on each.
+
+  # Child routes: alerts labelled service=gateway and severity=major go to scale-up.
+  routes:
+  - match:
+      service: gateway
+      severity: major
+    receiver: scale-up
+
+
+# Inhibition rules allow muting a set of alerts while another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition only if the alertname, cluster and service labels all match.
+  equal: ['alertname', 'cluster', 'service']
+
+receivers:
+- name: 'scale-up'
+  webhook_configs:
+    - url: http://gateway:8080/function/func_webhookstash
+      send_resolved: true
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@ -7,12 +7,12 @@ global:
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
-      monitor: 'codelab-monitor'
+      monitor: 'faas-monitor'

 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
-  # - "first.rules"
-  # - "second.rules"
+    - 'alert.rules'
+

 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
@ -29,6 +29,6 @@ scrape_configs:
      - targets: ['localhost:9090']

  - job_name: "gateway"
-    scrape_interval: "15s"
+    scrape_interval: 5s
    static_configs:
      - targets: ['gateway:8080']