Commit 50d98b16 by bergquist

feat(alerting): adds support for retries

parent 68f01d57
......@@ -113,6 +113,7 @@ type AlertJob struct {
Offset int64
Delay bool
Running bool
Retry int
Rule AlertRule
}
......
......@@ -27,6 +27,7 @@ var (
AlertStateCritical = "CRITICAL"
AlertStateAcknowledged = "ACKNOWLEDGED"
AlertStateMaintenance = "MAINTENANCE"
AlertStatePending = "PENDING"
)
func (this *UpdateAlertStateCommand) IsValidState() bool {
......
package alerting
import (
"fmt"
"time"
"github.com/grafana/grafana/pkg/bus"
......@@ -9,6 +10,10 @@ import (
"github.com/grafana/grafana/pkg/setting"
)
var (
MaxRetries = 3
)
func Init() {
if !setting.AlertingEnabled {
return
......@@ -70,6 +75,7 @@ func (scheduler *Scheduler) updateJobs(alertRuleFn func() []m.AlertRule) {
} else {
job = &m.AlertJob{
Running: false,
Retry: 0,
}
}
......@@ -104,18 +110,32 @@ func (scheduler *Scheduler) executor(executor Executor) {
func (scheduler *Scheduler) handleResponses() {
for response := range scheduler.responseQueue {
log.Info("Response: alert(%d) status(%s) actual(%v) running(%v)", response.Id, response.State, response.ActualValue, response.AlertJob.Running)
log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d) running(%v)", response.Id, response.State, response.ActualValue, response.AlertJob.Retry, response.AlertJob.Running)
response.AlertJob.Running = false
cmd := &m.UpdateAlertStateCommand{
AlertId: response.Id,
NewState: response.State,
Info: response.Description,
if response.State == m.AlertStatePending {
response.AlertJob.Retry++
if response.AlertJob.Retry > MaxRetries {
response.State = m.AlertStateCritical
response.Description = fmt.Sprintf("Failed to run check after %d retires", MaxRetries)
scheduler.saveState(response)
}
} else {
response.AlertJob.Retry = 0
scheduler.saveState(response)
}
}
}
if err := bus.Dispatch(cmd); err != nil {
log.Error(2, "failed to save state %v", err)
}
func (scheduler *Scheduler) saveState(response *m.AlertResult) {
cmd := &m.UpdateAlertStateCommand{
AlertId: response.Id,
NewState: response.State,
Info: response.Description,
}
if err := bus.Dispatch(cmd); err != nil {
log.Error(2, "failed to save state %v", err)
}
}
......@@ -129,7 +149,7 @@ func (scheduler *Scheduler) measureAndExecute(exec Executor, job *m.AlertJob) {
case <-time.After(time.Second * 5):
scheduler.responseQueue <- &m.AlertResult{
Id: job.Rule.Id,
State: "timed out",
State: m.AlertStatePending,
Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
AlertJob: job,
}
......
......@@ -81,7 +81,7 @@ func (this *ExecutorImpl) Execute(job *m.AlertJob, responseQueue chan *m.AlertRe
response, err := b.GetSeries(job)
if err != nil {
responseQueue <- &m.AlertResult{State: "PENDING", Id: job.Rule.Id, AlertJob: job}
responseQueue <- &m.AlertResult{State: m.AlertStatePending, Id: job.Rule.Id, AlertJob: job}
}
result := this.validateRule(job.Rule, response)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment