Introduce gateway_function_invocation_total to track individual functions

Introduce prometheus_alertmanager into stack - have it fire into webhook stash
2025-06-24 07:43:25 +00:00 · 2017-01-21 10:11:33 +00:00
parent a9e72cd0a3
commit ab2f8e85f3
8 changed files with 118 additions and 8 deletions
--- a/prometheus/alert.rules
+++ b/prometheus/alert.rules
@@ -0,0 +1,15 @@
+ALERT service_down  # no FOR clause, so there is no hold period before firing
+  IF up == 0
+
+ALERT APIHighInvocationRate
+  IF rate(gateway_function_invocation_total[10s]) > 5
+  FOR 30s  # the condition must hold this long before the alert fires
+  LABELS {
+    service = "gateway",
+    severity = "major",
+    value = "{{$value}}",
+  }
+  ANNOTATIONS {
+    summary = "High invocation total on {{ $labels.instance }}",
+    description =  "High invocation total on {{ $labels.instance }}",
+  }
--- a/prometheus/alertmanager.yml
+++ b/prometheus/alertmanager.yml
@@ -0,0 +1,68 @@
+global:
+  # SMTP smarthost, sender and login for mail notifications (these look like example values).
+  smtp_smarthost: 'localhost:25'
+  smtp_from: 'alertmanager@example.org'
+  smtp_auth_username: 'alertmanager'
+  smtp_auth_password: 'password'  # presumably a placeholder; never commit a real secret here
+  # The auth token for Hipchat. NOTE(review): no receiver in this file uses email or Hipchat.
+  hipchat_auth_token: '1234556789'
+  # Alternative host for Hipchat.
+  hipchat_url: 'https://hipchat.foobar.org/'
+
+# Glob patterns for the files from which notification templates are read.
+templates: 
+- '/etc/alertmanager/template/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' before sending the initial notification.
+  # This ensures that multiple alerts for the same group that start firing
+  # shortly after one another are batched together in the first
+  # notification.
+  group_wait: 30s
+
+  # After the first notification was sent, wait 'group_interval' before sending
+  # a batch of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If a notification has been sent successfully, wait 'repeat_interval'
+  # before resending it.
+  repeat_interval: 3h
+
+  # The default receiver.
+  receiver: scale-up
+
+  # All the above attributes are inherited by all child routes and can be
+  # overwritten on each.
+
+  # Child route: alerts labelled service=gateway and severity=major go to scale-up.
+  routes:
+  - match:
+      service: gateway
+      severity: major
+    receiver: scale-up
+
+
+# Inhibition rules allow muting a set of alerts while another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical. NOTE(review): the visible alert.rules only emits severity "major".
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition only if alertname, cluster and service are all equal.
+  equal: ['alertname', 'cluster', 'service']
+
+receivers:
+- name: scale-up  # referenced by 'receiver' in the route section
+  webhook_configs:
+  - url: 'http://gateway:8080/function/func_webhookstash'
+    send_resolved: true  # also notify the webhook when an alert resolves
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -7,12 +7,12 @@ global:
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
-      monitor: 'codelab-monitor'
+      monitor: 'faas-monitor'

 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
-  # - "first.rules"
-  # - "second.rules"
+    - 'alert.rules'
+

 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
@@ -29,6 +29,6 @@ scrape_configs:
      - targets: ['localhost:9090']

  - job_name: "gateway"
-    scrape_interval: "15s"
+    scrape_interval: 5s
    static_configs:
      - targets: ['gateway:8080']