From 3bcc10a07e264684d81355ea08a7403673b76bb1 Mon Sep 17 00:00:00 2001
From: Lucas Roesler <roesler.lucas@gmail.com>
Date: Thu, 21 Feb 2019 10:01:40 +0100
Subject: [PATCH] Upgrade prom and alert manager to 2.7.1 and 0.16.1

**What**
- Removes the `value` label from the scale-up alert
- Updates the annotations to use the `function_name` label instead of
the `instance` label that was removed.
- Per prometheus/prometheus#4836 and the related mailing list discussion
https://groups.google.com/d/msg/prometheus-users/7Ul6ngc7Ogs/j_YDszV5BwAJ
the alert value should not be included in the alert labels; otherwise
each calculation of the alert is treated like a new alert, and the
use of `for 5s` will not behave as expected.
- Ports the resolution from openfaas/faas-netes#372

Signed-off-by: Lucas Roesler <roesler.lucas@gmail.com>
---
 docker-compose.yml         | 69 +++++++++++++++++++-------------------
 prometheus/alert.rules.yml |  5 ++-
 prometheus/k8s.alert.rules |  9 +++--
 3 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 0f5b9853..7834a509 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,17 +8,17 @@ services:
             - functions
         environment:
             functions_provider_url: "http://faas-swarm:8080/"
-            read_timeout:  "5m5s"           # Maximum time to read HTTP request
-            write_timeout: "5m5s"           # Maximum time to write HTTP response
-            upstream_timeout: "5m"          # Maximum duration of upstream function call - should be more than read_timeout and write_timeout
-            dnsrr: "true"                   # Temporarily use dnsrr in place of VIP while issue persists on PWD
+            read_timeout: "5m5s" # Maximum time to read HTTP request
+            write_timeout: "5m5s" # Maximum time to write HTTP response
+            upstream_timeout: "5m" # Maximum duration of upstream function call - should be more than read_timeout and write_timeout
+            dnsrr: "true" # Temporarily use dnsrr in place of VIP while issue persists on PWD
             faas_nats_address: "nats"
             faas_nats_port: 4222
-            direct_functions: "true"        # Functions are invoked directly over the overlay network
+            direct_functions: "true" # Functions are invoked directly over the overlay network
             direct_functions_suffix: ""
             basic_auth: "${BASIC_AUTH:-true}"
             secret_mount_path: "/run/secrets/"
-            scale_from_zero: "true"         # Enable if you want functions to scale from 0/0 to min replica count upon invoke
+            scale_from_zero: "true" # Enable if you want functions to scale from 0/0 to min replica count upon invoke
             max_idle_conns: 1024
             max_idle_conns_per_host: 1024
         deploy:
@@ -34,7 +34,7 @@ services:
                 window: 380s
             placement:
                 constraints:
-                    - 'node.platform.os == linux'
+                    - "node.platform.os == linux"
         secrets:
             - basic-auth-user
             - basic-auth-password
@@ -43,20 +43,20 @@ services:
     faas-swarm:
         volumes:
             - "/var/run/docker.sock:/var/run/docker.sock"
-        image:  openfaas/faas-swarm:0.6.1
+        image: openfaas/faas-swarm:0.6.1
         networks:
             - functions
         environment:
-            read_timeout:  "5m5s"       # set both here, and on your functions
-            write_timeout: "5m5s"       # set both here, and on your functions
+            read_timeout: "5m5s" # set both here, and on your functions
+            write_timeout: "5m5s" # set both here, and on your functions
             DOCKER_API_VERSION: "1.30"
             basic_auth: "${BASIC_AUTH:-true}"
             secret_mount_path: "/run/secrets/"
         deploy:
             placement:
                 constraints:
-                    - 'node.role == manager'
-                    - 'node.platform.os == linux'
+                    - "node.role == manager"
+                    - "node.platform.os == linux"
             resources:
                 # limits:   # Enable if you want to limit memory usage
                 #     memory: 100M
@@ -89,7 +89,7 @@ services:
                     memory: 50M
             placement:
                 constraints:
-                    - 'node.platform.os == linux'
+                    - "node.platform.os == linux"
 
     queue-worker:
         image: openfaas/queue-worker:0.7.0
@@ -97,7 +97,7 @@ services:
             - functions
         environment:
             max_inflight: "1"
-            ack_wait: "5m5s"    # Max duration of any async task / request
+            ack_wait: "5m5s" # Max duration of any async task / request
             basic_auth: "${BASIC_AUTH:-true}"
             secret_mount_path: "/run/secrets/"
         deploy:
@@ -113,7 +113,7 @@ services:
                 window: 380s
             placement:
                 constraints:
-                    - 'node.platform.os == linux'
+                    - "node.platform.os == linux"
         secrets:
             - basic-auth-user
             - basic-auth-password
@@ -127,12 +127,12 @@ services:
         environment:
             no_proxy: "gateway"
         configs:
-          - source: prometheus_config
-            target: /etc/prometheus/prometheus.yml
-          - source: prometheus_rules
-            target: /etc/prometheus/alert.rules.yml
+            - source: prometheus_config
+              target: /etc/prometheus/prometheus.yml
+            - source: prometheus_rules
+              target: /etc/prometheus/alert.rules.yml
         command:
-          - '--config.file=/etc/prometheus/prometheus.yml'
+            - "--config.file=/etc/prometheus/prometheus.yml"
         #   - '-storage.local.path=/prometheus'
         ports:
             - 9090:9090
@@ -141,8 +141,8 @@ services:
         deploy:
             placement:
                 constraints:
-                    - 'node.role == manager'
-                    - 'node.platform.os == linux'
+                    - "node.role == manager"
+                    - "node.platform.os == linux"
             resources:
                 limits:
                     memory: 500M
@@ -150,12 +150,12 @@ services:
                     memory: 200M
 
     alertmanager:
-        image: prom/alertmanager:v0.15.0
+        image: prom/alertmanager:v0.16.1
         environment:
             no_proxy: "gateway"
         command:
-            - '--config.file=/alertmanager.yml'
-            - '--storage.path=/alertmanager'
+            - "--config.file=/alertmanager.yml"
+            - "--storage.path=/alertmanager"
         networks:
             - functions
         # Uncomment the following port mapping if you wish to expose the Prometheus
@@ -170,29 +170,28 @@ services:
                     memory: 20M
             placement:
                 constraints:
-                    - 'node.role == manager'
-                    - 'node.platform.os == linux'
+                    - "node.role == manager"
+                    - "node.platform.os == linux"
         configs:
             - source: alertmanager_config
               target: /alertmanager.yml
         secrets:
             - basic-auth-password
 
-
 configs:
-     prometheus_config:
-         file: ./prometheus/prometheus.yml
-     prometheus_rules:
-         file: ./prometheus/alert.rules.yml
-     alertmanager_config:
-         file: ./prometheus/alertmanager.yml
+    prometheus_config:
+        file: ./prometheus/prometheus.yml
+    prometheus_rules:
+        file: ./prometheus/alert.rules.yml
+    alertmanager_config:
+        file: ./prometheus/alertmanager.yml
 
 networks:
     functions:
         driver: overlay
         attachable: true
         labels:
-          - "openfaas=true"
+            - "openfaas=true"
 
 secrets:
     basic-auth-user:
diff --git a/prometheus/alert.rules.yml b/prometheus/alert.rules.yml
index 9da14ae7..dc73a055 100644
--- a/prometheus/alert.rules.yml
+++ b/prometheus/alert.rules.yml
@@ -9,7 +9,6 @@ groups:
     labels:
       service: gateway
       severity: major
-      value: '{{$value}}'
     annotations:
-      description: High invocation total on {{ $labels.instance }}
-      summary: High invocation total on {{ $labels.instance }}
+      description: High invocation total on {{ $labels.function_name }}
+      summary: High invocation total on {{ $labels.function_name }}
diff --git a/prometheus/k8s.alert.rules b/prometheus/k8s.alert.rules
index 10d5b7cd..e2b95d95 100644
--- a/prometheus/k8s.alert.rules
+++ b/prometheus/k8s.alert.rules
@@ -2,15 +2,14 @@ ALERT service_down
   IF up == 0
 
 ALERT APIHighInvocationRate
-  IF sum ( rate(gateway_function_invocation_total{code="200"}[10s]) ) by (function_name) > 5 
+  IF sum ( rate(gateway_function_invocation_total{code="200"}[10s]) ) by (function_name) > 5
   FOR 5s
   LABELS {
     service = "gateway",
     severity = "major",
-    value = "{{$value}}"
   }
   ANNOTATIONS {
-    summary = "High invocation total on {{ $labels.instance }}",
-    description =  "High invocation total on {{ $labels.instance }}"
-  } 
+    summary = "High invocation total on {{ $labels.function_name }}",
+    description =  "High invocation total on {{ $labels.function_name }}"
+  }