Tweak alertmanager timeout + simplify down-scale of replicas

2025-06-09 00:36:46 +00:00 · 2017-01-23 22:44:03 +00:00 · 2017-01-23 22:44:03 +00:00 · cdd5219200
commit cdd5219200
parent 59ca597903
2 changed files with 16 additions and 6 deletions
--- a/gateway/server.go
+++ b/gateway/server.go
@ -32,12 +32,21 @@ func scaleService(req requests.PrometheusAlert, c *client.Client) error {
 			} else {
 				return err
 			}
-		} else {
-			replicas = *service.Spec.Mode.Replicated.Replicas - uint64(5)
-			if replicas <= 0 {
+		} else { // Resolved event.
+			// Previously decremented by 5, but the resolved event only fires once, so scale directly down to a single replica (1/1).
+			if *service.Spec.Mode.Replicated.Replicas > 1 {
+				// replicas = *service.Spec.Mode.Replicated.Replicas - uint64(5)
+				// if replicas < 1 {
+				// replicas = 1
+				// }
+				// return nil
+
 				replicas = 1
+			} else {
+				return nil
 			}
 		}
+
 		log.Printf("Scaling %s to %d replicas.\n", serviceName, replicas)

 		service.Spec.Mode.Replicated.Replicas = &replicas
--- a/prometheus/alertmanager.yml
+++ b/prometheus/alertmanager.yml
@ -25,15 +25,15 @@ route:
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first
  # notification.
-  group_wait: 30s
+  group_wait: 5s

  # After the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
-  group_interval: 5m
+  group_interval: 10s

  # If an alert has successfully been sent, wait 'repeat_interval' before
  # resending it.
-  repeat_interval: 3h 
+  repeat_interval: 30s 

  # A default receiver
  receiver: scale-up
@ -66,3 +66,4 @@ receivers:
  webhook_configs:
    - url: http://gateway:8080/system/alert
      send_resolved: true
+