Commit 1b84a924 by Zzy Committed by Marcus Efraimsson

Alerting: Makes timeouts and retries configurable (#16259)

Adds new alert settings for configuring timeouts and retries named 
evaluation_timeout_seconds, notification_timeout_seconds 
and max_attempts.

Closes #16240
parent e6d9a524
......@@ -521,6 +521,16 @@ nodata_or_nullvalues = no_data
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
concurrent_render_limit = 5
# Default setting for the alert evaluation timeout, in seconds. Default value is 30
evaluation_timeout_seconds = 30
# Default setting for the alert notification timeout, in seconds. Default value is 30
notification_timeout_seconds = 30
# Default setting for the maximum number of attempts to send alert notifications. Default value is 3
max_attempts = 3
#################################### Explore #############################
[explore]
# Enable the Explore section
......
......@@ -446,6 +446,16 @@ log_queries =
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
;concurrent_render_limit = 5
# Default setting for the alert evaluation timeout, in seconds. Default value is 30
;evaluation_timeout_seconds = 30
# Default setting for the alert notification timeout, in seconds. Default value is 30
;notification_timeout_seconds = 30
# Default setting for the maximum number of attempts to send alert notifications. Default value is 3
;max_attempts = 3
#################################### Explore #############################
[explore]
# Enable the Explore section
......
......@@ -650,6 +650,20 @@ Alert notifications can include images, but rendering many images at the same ti
This limit will protect the server from render overloading and make sure notifications are sent out quickly. Default
value is `5`.
### evaluation_timeout_seconds
Default setting for the alert evaluation timeout, in seconds. Default value is `30`.
### notification_timeout_seconds
Default setting for the alert notification timeout, in seconds. Default value is `30`.
### max_attempts
Default setting for the maximum number of attempts to send alert notifications. Default value is `3`.
## [panels]
### enable_alpha
......
......@@ -104,10 +104,6 @@ func (e *AlertingService) runJobDispatcher(grafanaCtx context.Context) error {
var (
unfinishedWorkTimeout = time.Second * 5
// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
alertTimeout = time.Second * 30
resultHandleTimeout = time.Second * 30
alertMaxAttempts = 3
)
func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
......@@ -117,7 +113,7 @@ func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *J
}
}()
cancelChan := make(chan context.CancelFunc, alertMaxAttempts*2)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
attemptChan := make(chan int, 1)
// Initialize with first attemptID=1
......@@ -161,7 +157,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
}
}()
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
cancelChan <- cancelFn
span := opentracing.StartSpan("alert execution")
alertCtx = opentracing.ContextWithSpan(alertCtx, span)
......@@ -197,7 +193,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
tlog.Error(evalContext.Error),
tlog.String("message", "alerting execution attempt failed"),
)
if attemptID < alertMaxAttempts {
if attemptID < setting.AlertingMaxAttempts {
span.Finish()
e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
attemptChan <- (attemptID + 1)
......@@ -206,7 +202,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
}
// create new context with timeout for notifications
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), resultHandleTimeout)
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
cancelChan <- resultHandleCancelFn
// override the context used for evaluation with a new context for notifications.
......
......@@ -11,20 +11,22 @@ import (
"testing"
"time"
"github.com/grafana/grafana/pkg/setting"
. "github.com/smartystreets/goconvey/convey"
)
func TestEngineTimeouts(t *testing.T) {
Convey("Alerting engine timeout tests", t, func() {
engine := NewEngine()
setting.AlertingNotificationTimeout = 30 * time.Second
setting.AlertingMaxAttempts = 3
engine.resultHandler = &FakeResultHandler{}
job := &Job{Running: true, Rule: &Rule{}}
Convey("Should trigger as many retries as needed", func() {
Convey("pended alert for datasource -> result handler should be worked", func() {
// reduce alert timeout to test quickly
originAlertTimeout := alertTimeout
alertTimeout = 2 * time.Second
setting.AlertingEvaluationTimeout = 30 * time.Second
transportTimeoutInterval := 2 * time.Second
serverBusySleepDuration := 1 * time.Second
......@@ -39,7 +41,7 @@ func TestEngineTimeouts(t *testing.T) {
So(resultHandler.ResultHandleSucceed, ShouldEqual, true)
// initialize for other tests.
alertTimeout = originAlertTimeout
setting.AlertingEvaluationTimeout = 2 * time.Second
engine.resultHandler = &FakeResultHandler{}
})
})
......
......@@ -6,7 +6,9 @@ import (
"math"
"testing"
"github.com/grafana/grafana/pkg/setting"
. "github.com/smartystreets/goconvey/convey"
"time"
)
type FakeEvalHandler struct {
......@@ -37,6 +39,9 @@ func (handler *FakeResultHandler) Handle(evalContext *EvalContext) error {
func TestEngineProcessJob(t *testing.T) {
Convey("Alerting engine job processing", t, func() {
engine := NewEngine()
setting.AlertingEvaluationTimeout = 30 * time.Second
setting.AlertingNotificationTimeout = 30 * time.Second
setting.AlertingMaxAttempts = 3
engine.resultHandler = &FakeResultHandler{}
job := &Job{Running: true, Rule: &Rule{}}
......@@ -45,9 +50,9 @@ func TestEngineProcessJob(t *testing.T) {
Convey("error + not last attempt -> retry", func() {
engine.evalHandler = NewFakeEvalHandler(0)
for i := 1; i < alertMaxAttempts; i++ {
for i := 1; i < setting.AlertingMaxAttempts; i++ {
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
engine.processJob(i, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan
......@@ -61,9 +66,9 @@ func TestEngineProcessJob(t *testing.T) {
Convey("error + last attempt -> no retry", func() {
engine.evalHandler = NewFakeEvalHandler(0)
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
engine.processJob(alertMaxAttempts, attemptChan, cancelChan, job)
engine.processJob(setting.AlertingMaxAttempts, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan
So(nextAttemptID, ShouldEqual, 0)
......@@ -74,7 +79,7 @@ func TestEngineProcessJob(t *testing.T) {
Convey("no error -> no retry", func() {
engine.evalHandler = NewFakeEvalHandler(1)
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
engine.processJob(1, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan
......@@ -88,7 +93,7 @@ func TestEngineProcessJob(t *testing.T) {
Convey("Should trigger as many retries as needed", func() {
Convey("never success -> max retries number", func() {
expectedAttempts := alertMaxAttempts
expectedAttempts := setting.AlertingMaxAttempts
evalHandler := NewFakeEvalHandler(0)
engine.evalHandler = evalHandler
......@@ -106,7 +111,7 @@ func TestEngineProcessJob(t *testing.T) {
})
Convey("some errors before success -> some retries", func() {
expectedAttempts := int(math.Ceil(float64(alertMaxAttempts) / 2))
expectedAttempts := int(math.Ceil(float64(setting.AlertingMaxAttempts) / 2))
evalHandler := NewFakeEvalHandler(expectedAttempts)
engine.evalHandler = evalHandler
......
......@@ -127,7 +127,7 @@ func (n *notificationService) uploadImage(context *EvalContext) (err error) {
renderOpts := rendering.Opts{
Width: 1000,
Height: 500,
Timeout: time.Duration(float64(alertTimeout) * 0.9),
Timeout: time.Duration(setting.AlertingEvaluationTimeout.Seconds() * 0.9),
OrgId: context.Rule.OrgId,
OrgRole: m.ROLE_ADMIN,
ConcurrentLimit: setting.AlertingRenderLimit,
......
......@@ -179,6 +179,10 @@ var (
AlertingErrorOrTimeout string
AlertingNoDataOrNullValues string
AlertingEvaluationTimeout time.Duration
AlertingNotificationTimeout time.Duration
AlertingMaxAttempts int
// Explore UI
ExploreEnabled bool
......@@ -760,6 +764,10 @@ func (cfg *Cfg) Load(args *CommandLineArgs) error {
AlertingErrorOrTimeout = alerting.Key("error_or_timeout").MustString("alerting")
AlertingNoDataOrNullValues = alerting.Key("nodata_or_nullvalues").MustString("no_data")
AlertingEvaluationTimeout = alerting.Key("evaluation_timeout_seconds").MustDuration(time.Second * 30)
AlertingNotificationTimeout = alerting.Key("notification_timeout_seconds").MustDuration(time.Second * 30)
AlertingMaxAttempts = alerting.Key("max_attempts").MustInt(3)
explore := iniFile.Section("explore")
ExploreEnabled = explore.Key("enabled").MustBool(true)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment