Upgrade prom and alert manager to 2.7.1 and 0.16.1

**What**
- Removes the `value` label from the scale-up alert
- Updates the annotations to use the `function_name` label instead of
the `instance` label that was removed.
- Per prometheus/prometheus#4836 and the related mailing list discussion
https://groups.google.com/d/msg/prometheus-users/7Ul6ngc7Ogs/j_YDszV5BwAJ
the alert value should not be included in the alert labels; otherwise,
each calculation of the alert is treated like a new alert, and the
use of `for 5s` will not behave as expected.
- Ports the resolution from openfaas/faas-netes#372

Signed-off-by: Lucas Roesler <roesler.lucas@gmail.com>
This commit is contained in:
Lucas Roesler 2019-02-21 10:01:40 +01:00 committed by Alex Ellis
parent f2ac7b906c
commit 3bcc10a07e
3 changed files with 40 additions and 43 deletions

View File

@ -8,17 +8,17 @@ services:
- functions
environment:
functions_provider_url: "http://faas-swarm:8080/"
read_timeout: "5m5s" # Maximum time to read HTTP request
write_timeout: "5m5s" # Maximum time to write HTTP response
upstream_timeout: "5m" # Maximum duration of upstream function call - should be more than read_timeout and write_timeout
dnsrr: "true" # Temporarily use dnsrr in place of VIP while issue persists on PWD
read_timeout: "5m5s" # Maximum time to read HTTP request
write_timeout: "5m5s" # Maximum time to write HTTP response
upstream_timeout: "5m" # Maximum duration of upstream function call - should be more than read_timeout and write_timeout
dnsrr: "true" # Temporarily use dnsrr in place of VIP while issue persists on PWD
faas_nats_address: "nats"
faas_nats_port: 4222
direct_functions: "true" # Functions are invoked directly over the overlay network
direct_functions: "true" # Functions are invoked directly over the overlay network
direct_functions_suffix: ""
basic_auth: "${BASIC_AUTH:-true}"
secret_mount_path: "/run/secrets/"
scale_from_zero: "true" # Enable if you want functions to scale from 0/0 to min replica count upon invoke
scale_from_zero: "true" # Enable if you want functions to scale from 0/0 to min replica count upon invoke
max_idle_conns: 1024
max_idle_conns_per_host: 1024
deploy:
@ -34,7 +34,7 @@ services:
window: 380s
placement:
constraints:
- 'node.platform.os == linux'
- "node.platform.os == linux"
secrets:
- basic-auth-user
- basic-auth-password
@ -43,20 +43,20 @@ services:
faas-swarm:
volumes:
- "/var/run/docker.sock:/var/run/docker.sock"
image: openfaas/faas-swarm:0.6.1
image: openfaas/faas-swarm:0.6.1
networks:
- functions
environment:
read_timeout: "5m5s" # set both here, and on your functions
write_timeout: "5m5s" # set both here, and on your functions
read_timeout: "5m5s" # set both here, and on your functions
write_timeout: "5m5s" # set both here, and on your functions
DOCKER_API_VERSION: "1.30"
basic_auth: "${BASIC_AUTH:-true}"
secret_mount_path: "/run/secrets/"
deploy:
placement:
constraints:
- 'node.role == manager'
- 'node.platform.os == linux'
- "node.role == manager"
- "node.platform.os == linux"
resources:
# limits: # Enable if you want to limit memory usage
# memory: 100M
@ -89,7 +89,7 @@ services:
memory: 50M
placement:
constraints:
- 'node.platform.os == linux'
- "node.platform.os == linux"
queue-worker:
image: openfaas/queue-worker:0.7.0
@ -97,7 +97,7 @@ services:
- functions
environment:
max_inflight: "1"
ack_wait: "5m5s" # Max duration of any async task / request
ack_wait: "5m5s" # Max duration of any async task / request
basic_auth: "${BASIC_AUTH:-true}"
secret_mount_path: "/run/secrets/"
deploy:
@ -113,7 +113,7 @@ services:
window: 380s
placement:
constraints:
- 'node.platform.os == linux'
- "node.platform.os == linux"
secrets:
- basic-auth-user
- basic-auth-password
@ -127,12 +127,12 @@ services:
environment:
no_proxy: "gateway"
configs:
- source: prometheus_config
target: /etc/prometheus/prometheus.yml
- source: prometheus_rules
target: /etc/prometheus/alert.rules.yml
- source: prometheus_config
target: /etc/prometheus/prometheus.yml
- source: prometheus_rules
target: /etc/prometheus/alert.rules.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- "--config.file=/etc/prometheus/prometheus.yml"
# - '-storage.local.path=/prometheus'
ports:
- 9090:9090
@ -141,8 +141,8 @@ services:
deploy:
placement:
constraints:
- 'node.role == manager'
- 'node.platform.os == linux'
- "node.role == manager"
- "node.platform.os == linux"
resources:
limits:
memory: 500M
@ -150,12 +150,12 @@ services:
memory: 200M
alertmanager:
image: prom/alertmanager:v0.15.0
image: prom/alertmanager:v0.16.1
environment:
no_proxy: "gateway"
command:
- '--config.file=/alertmanager.yml'
- '--storage.path=/alertmanager'
- "--config.file=/alertmanager.yml"
- "--storage.path=/alertmanager"
networks:
- functions
# Uncomment the following port mapping if you wish to expose the Prometheus
@ -170,29 +170,28 @@ services:
memory: 20M
placement:
constraints:
- 'node.role == manager'
- 'node.platform.os == linux'
- "node.role == manager"
- "node.platform.os == linux"
configs:
- source: alertmanager_config
target: /alertmanager.yml
secrets:
- basic-auth-password
configs:
prometheus_config:
file: ./prometheus/prometheus.yml
prometheus_rules:
file: ./prometheus/alert.rules.yml
alertmanager_config:
file: ./prometheus/alertmanager.yml
prometheus_config:
file: ./prometheus/prometheus.yml
prometheus_rules:
file: ./prometheus/alert.rules.yml
alertmanager_config:
file: ./prometheus/alertmanager.yml
networks:
functions:
driver: overlay
attachable: true
labels:
- "openfaas=true"
- "openfaas=true"
secrets:
basic-auth-user:

View File

@ -9,7 +9,6 @@ groups:
labels:
service: gateway
severity: major
value: '{{$value}}'
annotations:
description: High invocation total on {{ $labels.instance }}
summary: High invocation total on {{ $labels.instance }}
description: High invocation total on {{ $labels.function_name }}
summary: High invocation total on {{ $labels.function_name }}

View File

@ -2,15 +2,14 @@ ALERT service_down
IF up == 0
ALERT APIHighInvocationRate
IF sum ( rate(gateway_function_invocation_total{code="200"}[10s]) ) by (function_name) > 5
IF sum ( rate(gateway_function_invocation_total{code="200"}[10s]) ) by (function_name) > 5
FOR 5s
LABELS {
service = "gateway",
severity = "major",
value = "{{$value}}"
}
ANNOTATIONS {
summary = "High invocation total on {{ $labels.instance }}",
description = "High invocation total on {{ $labels.instance }}"
}
summary = "High invocation total on {{ $labels.function_name }}",
description = "High invocation total on {{ $labels.function_name }}"
}