From 3bcc10a07e264684d81355ea08a7403673b76bb1 Mon Sep 17 00:00:00 2001 From: Lucas Roesler Date: Thu, 21 Feb 2019 10:01:40 +0100 Subject: [PATCH] Upgrade prom and alert manager to 2.7.1 and 0.16.1 **What** - Removes the `value` label in the scale-up alert - Updates the annotations to use the `function_name` label instead of the `instance` label that was removed. - Per prometheus/prometheus#4836 and the related mailing list discussion https://groups.google.com/d/msg/prometheus-users/7Ul6ngc7Ogs/j_YDszV5BwAJ the alert value should not be included in the alert labels otherwise each calculation of the alert is treated like a new alert and then the use of `for 5s` will not behave as expected. - Ports the resolution from openfaas/faas-netes#372 Signed-off-by: Lucas Roesler --- docker-compose.yml | 69 +++++++++++++++++++------------------- prometheus/alert.rules.yml | 5 ++- prometheus/k8s.alert.rules | 9 +++-- 3 files changed, 40 insertions(+), 43 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0f5b9853..7834a509 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,17 +8,17 @@ services: - functions environment: functions_provider_url: "http://faas-swarm:8080/" - read_timeout: "5m5s" # Maximum time to read HTTP request - write_timeout: "5m5s" # Maximum time to write HTTP response - upstream_timeout: "5m" # Maximum duration of upstream function call - should be more than read_timeout and write_timeout - dnsrr: "true" # Temporarily use dnsrr in place of VIP while issue persists on PWD + read_timeout: "5m5s" # Maximum time to read HTTP request + write_timeout: "5m5s" # Maximum time to write HTTP response + upstream_timeout: "5m" # Maximum duration of upstream function call - should be more than read_timeout and write_timeout + dnsrr: "true" # Temporarily use dnsrr in place of VIP while issue persists on PWD faas_nats_address: "nats" faas_nats_port: 4222 - direct_functions: "true" # Functions are invoked directly over the overlay network + 
direct_functions: "true" # Functions are invoked directly over the overlay network direct_functions_suffix: "" basic_auth: "${BASIC_AUTH:-true}" secret_mount_path: "/run/secrets/" - scale_from_zero: "true" # Enable if you want functions to scale from 0/0 to min replica count upon invoke + scale_from_zero: "true" # Enable if you want functions to scale from 0/0 to min replica count upon invoke max_idle_conns: 1024 max_idle_conns_per_host: 1024 deploy: @@ -34,7 +34,7 @@ services: window: 380s placement: constraints: - - 'node.platform.os == linux' + - "node.platform.os == linux" secrets: - basic-auth-user - basic-auth-password @@ -43,20 +43,20 @@ services: faas-swarm: volumes: - "/var/run/docker.sock:/var/run/docker.sock" - image: openfaas/faas-swarm:0.6.1 + image: openfaas/faas-swarm:0.6.1 networks: - functions environment: - read_timeout: "5m5s" # set both here, and on your functions - write_timeout: "5m5s" # set both here, and on your functions + read_timeout: "5m5s" # set both here, and on your functions + write_timeout: "5m5s" # set both here, and on your functions DOCKER_API_VERSION: "1.30" basic_auth: "${BASIC_AUTH:-true}" secret_mount_path: "/run/secrets/" deploy: placement: constraints: - - 'node.role == manager' - - 'node.platform.os == linux' + - "node.role == manager" + - "node.platform.os == linux" resources: # limits: # Enable if you want to limit memory usage # memory: 100M @@ -89,7 +89,7 @@ services: memory: 50M placement: constraints: - - 'node.platform.os == linux' + - "node.platform.os == linux" queue-worker: image: openfaas/queue-worker:0.7.0 @@ -97,7 +97,7 @@ services: - functions environment: max_inflight: "1" - ack_wait: "5m5s" # Max duration of any async task / request + ack_wait: "5m5s" # Max duration of any async task / request basic_auth: "${BASIC_AUTH:-true}" secret_mount_path: "/run/secrets/" deploy: @@ -113,7 +113,7 @@ services: window: 380s placement: constraints: - - 'node.platform.os == linux' + - "node.platform.os == linux" secrets: 
- basic-auth-user - basic-auth-password @@ -127,12 +127,12 @@ services: environment: no_proxy: "gateway" configs: - - source: prometheus_config - target: /etc/prometheus/prometheus.yml - - source: prometheus_rules - target: /etc/prometheus/alert.rules.yml + - source: prometheus_config + target: /etc/prometheus/prometheus.yml + - source: prometheus_rules + target: /etc/prometheus/alert.rules.yml command: - - '--config.file=/etc/prometheus/prometheus.yml' + - "--config.file=/etc/prometheus/prometheus.yml" # - '-storage.local.path=/prometheus' ports: - 9090:9090 @@ -141,8 +141,8 @@ services: deploy: placement: constraints: - - 'node.role == manager' - - 'node.platform.os == linux' + - "node.role == manager" + - "node.platform.os == linux" resources: limits: memory: 500M @@ -150,12 +150,12 @@ services: memory: 200M alertmanager: - image: prom/alertmanager:v0.15.0 + image: prom/alertmanager:v0.16.1 environment: no_proxy: "gateway" command: - - '--config.file=/alertmanager.yml' - - '--storage.path=/alertmanager' + - "--config.file=/alertmanager.yml" + - "--storage.path=/alertmanager" networks: - functions # Uncomment the following port mapping if you wish to expose the Prometheus @@ -170,29 +170,28 @@ services: memory: 20M placement: constraints: - - 'node.role == manager' - - 'node.platform.os == linux' + - "node.role == manager" + - "node.platform.os == linux" configs: - source: alertmanager_config target: /alertmanager.yml secrets: - basic-auth-password - configs: - prometheus_config: - file: ./prometheus/prometheus.yml - prometheus_rules: - file: ./prometheus/alert.rules.yml - alertmanager_config: - file: ./prometheus/alertmanager.yml + prometheus_config: + file: ./prometheus/prometheus.yml + prometheus_rules: + file: ./prometheus/alert.rules.yml + alertmanager_config: + file: ./prometheus/alertmanager.yml networks: functions: driver: overlay attachable: true labels: - - "openfaas=true" + - "openfaas=true" secrets: basic-auth-user: diff --git 
a/prometheus/alert.rules.yml b/prometheus/alert.rules.yml index 9da14ae7..dc73a055 100644 --- a/prometheus/alert.rules.yml +++ b/prometheus/alert.rules.yml @@ -9,7 +9,6 @@ groups: labels: service: gateway severity: major - value: '{{$value}}' annotations: - description: High invocation total on {{ $labels.instance }} - summary: High invocation total on {{ $labels.instance }} + description: High invocation total on {{ $labels.function_name }} + summary: High invocation total on {{ $labels.function_name }} diff --git a/prometheus/k8s.alert.rules b/prometheus/k8s.alert.rules index 10d5b7cd..e2b95d95 100644 --- a/prometheus/k8s.alert.rules +++ b/prometheus/k8s.alert.rules @@ -2,15 +2,14 @@ ALERT service_down IF up == 0 ALERT APIHighInvocationRate - IF sum ( rate(gateway_function_invocation_total{code="200"}[10s]) ) by (function_name) > 5 + IF sum ( rate(gateway_function_invocation_total{code="200"}[10s]) ) by (function_name) > 5 FOR 5s LABELS { service = "gateway", severity = "major", - value = "{{$value}}" } ANNOTATIONS { - summary = "High invocation total on {{ $labels.instance }}", - description = "High invocation total on {{ $labels.instance }}" - } + summary = "High invocation total on {{ $labels.function_name }}", + description = "High invocation total on {{ $labels.function_name }}" + }