Commit 71ffd1d1 by Marcus Efraimsson Committed by GitHub

Alerting: Fix image rendering and uploading timeout preventing to send alert notifications (#21536)

* svc alerting - use a shorter ctx to upload the img
This will prevent timeout on img upload to cancel the notifications from being sent

* components img uploader - pass the ctx to aws lib

* make webdavuploader use the ctx

* make azureblobuploader use the ctx

* rename uploadImage() to renderAndUploadImage()
for better clarity about what this method work

* Use timeout + 2s for plugin renderer (same as service and phantomjs)

* Make sure that original EvalContext is updated after render and upload

* Verify notification sent even if render or image upload times out

* fix lint

* fixes after review

Co-authored-by: Edouard Hur <3418467+hekmon@users.noreply.github.com>

Fixes #21018
parent 6e412d88
......@@ -59,7 +59,7 @@ func (az *AzureBlobUploader) Upload(ctx context.Context, imageDiskPath string) (
randomFileName += pngExt
// upload image
az.log.Debug("Uploading image to azure_blob", "container_name", az.container_name, "blob_name", randomFileName)
resp, err := blob.FileUpload(az.container_name, randomFileName, file)
resp, err := blob.FileUpload(ctx, az.container_name, randomFileName, file)
if err != nil {
return "", err
}
......@@ -162,7 +162,7 @@ func copyHeadersToRequest(req *http.Request, headers map[string]string) {
}
}
func (c *StorageClient) FileUpload(container, blobName string, body io.Reader) (*http.Response, error) {
func (c *StorageClient) FileUpload(ctx context.Context, container, blobName string, body io.Reader) (*http.Response, error) {
blobName = escape(blobName)
extension := strings.ToLower(path.Ext(blobName))
contentType := mime.TypeByExtension(extension)
......@@ -178,6 +178,9 @@ func (c *StorageClient) FileUpload(container, blobName string, body io.Reader) (
if err != nil {
return nil, err
}
if ctx != nil {
req = req.WithContext(ctx)
}
copyHeadersToRequest(req, map[string]string{
"x-ms-blob-type": "BlockBlob",
......
......@@ -83,7 +83,7 @@ func (u *S3Uploader) Upload(ctx context.Context, imageDiskPath string) (string,
return "", err
}
uploader := s3manager.NewUploader(sess)
result, err := uploader.Upload(&s3manager.UploadInput{
result, err := uploader.UploadWithContext(ctx, &s3manager.UploadInput{
Bucket: aws.String(u.bucket),
Key: aws.String(key),
ACL: aws.String(u.acl),
......
......@@ -64,7 +64,9 @@ func (u *WebdavUploader) Upload(ctx context.Context, pa string) (string, error)
if err != nil {
return "", err
}
if ctx != nil {
req = req.WithContext(ctx)
}
if u.username != "" {
req.SetBasicAuth(u.username, u.password)
}
......
package alerting
import (
"context"
"errors"
"fmt"
"time"
"github.com/grafana/grafana/pkg/bus"
"github.com/grafana/grafana/pkg/components/imguploader"
......@@ -39,8 +41,8 @@ type notificationService struct {
renderService rendering.Service
}
func (n *notificationService) SendIfNeeded(context *EvalContext) error {
notifierStates, err := n.getNeededNotifiers(context.Rule.OrgID, context.Rule.Notifications, context)
func (n *notificationService) SendIfNeeded(evalCtx *EvalContext) error {
notifierStates, err := n.getNeededNotifiers(evalCtx.Rule.OrgID, evalCtx.Rule.Notifications, evalCtx)
if err != nil {
n.log.Error("Failed to get alert notifiers", "error", err)
return err
......@@ -51,12 +53,22 @@ func (n *notificationService) SendIfNeeded(context *EvalContext) error {
}
if notifierStates.ShouldUploadImage() {
if err = n.uploadImage(context); err != nil {
n.log.Error("Failed to upload alert panel image.", "error", err)
// Create a copy of EvalContext and give it a new, shorter, timeout context to upload the image
uploadEvalCtx := *evalCtx
timeout := setting.AlertingNotificationTimeout / 2
var uploadCtxCancel func()
uploadEvalCtx.Ctx, uploadCtxCancel = context.WithTimeout(evalCtx.Ctx, timeout)
// Try to upload the image without consuming all the time allocated for EvalContext
if err = n.renderAndUploadImage(&uploadEvalCtx, timeout); err != nil {
n.log.Error("Failed to render and upload alert panel image.", "ruleId", uploadEvalCtx.Rule.ID, "error", err)
}
uploadCtxCancel()
evalCtx.ImageOnDiskPath = uploadEvalCtx.ImageOnDiskPath
evalCtx.ImagePublicURL = uploadEvalCtx.ImagePublicURL
}
return n.sendNotifications(context, notifierStates)
return n.sendNotifications(evalCtx, notifierStates)
}
func (n *notificationService) sendAndMarkAsComplete(evalContext *EvalContext, notifierState *notifierState) error {
......@@ -123,7 +135,7 @@ func (n *notificationService) sendNotifications(evalContext *EvalContext, notifi
return nil
}
func (n *notificationService) uploadImage(context *EvalContext) (err error) {
func (n *notificationService) renderAndUploadImage(evalCtx *EvalContext, timeout time.Duration) (err error) {
uploader, err := newImageUploaderProvider()
if err != nil {
return err
......@@ -132,32 +144,41 @@ func (n *notificationService) uploadImage(context *EvalContext) (err error) {
renderOpts := rendering.Opts{
Width: 1000,
Height: 500,
Timeout: setting.AlertingEvaluationTimeout,
OrgId: context.Rule.OrgID,
Timeout: timeout,
OrgId: evalCtx.Rule.OrgID,
OrgRole: models.ROLE_ADMIN,
ConcurrentLimit: setting.AlertingRenderLimit,
}
ref, err := context.GetDashboardUID()
ref, err := evalCtx.GetDashboardUID()
if err != nil {
return err
}
renderOpts.Path = fmt.Sprintf("d-solo/%s/%s?orgId=%d&panelId=%d", ref.Uid, ref.Slug, context.Rule.OrgID, context.Rule.PanelID)
renderOpts.Path = fmt.Sprintf("d-solo/%s/%s?orgId=%d&panelId=%d", ref.Uid, ref.Slug, evalCtx.Rule.OrgID, evalCtx.Rule.PanelID)
result, err := n.renderService.Render(context.Ctx, renderOpts)
n.log.Debug("Rendering alert panel image", "ruleId", evalCtx.Rule.ID, "urlPath", renderOpts.Path)
start := time.Now()
result, err := n.renderService.Render(evalCtx.Ctx, renderOpts)
if err != nil {
return err
}
took := time.Since(start)
context.ImageOnDiskPath = result.FilePath
context.ImagePublicURL, err = uploader.Upload(context.Ctx, context.ImageOnDiskPath)
n.log.Debug("Rendered alert panel image", "ruleId", evalCtx.Rule.ID, "path", result.FilePath, "took", took)
evalCtx.ImageOnDiskPath = result.FilePath
n.log.Debug("Uploading alert panel image to external image store", "ruleId", evalCtx.Rule.ID, "path", evalCtx.ImageOnDiskPath)
start = time.Now()
evalCtx.ImagePublicURL, err = uploader.Upload(evalCtx.Ctx, evalCtx.ImageOnDiskPath)
if err != nil {
return err
}
took = time.Since(start)
if context.ImagePublicURL != "" {
n.log.Info("uploaded screenshot of alert to external image store", "url", context.ImagePublicURL)
if evalCtx.ImagePublicURL != "" {
n.log.Debug("Uploaded alert panel image to external image store", "ruleId", evalCtx.Rule.ID, "url", evalCtx.ImagePublicURL, "took", took)
}
return nil
......
......@@ -6,6 +6,7 @@ import (
"time"
"github.com/grafana/grafana/pkg/components/simplejson"
"github.com/grafana/grafana/pkg/setting"
"github.com/grafana/grafana/pkg/services/rendering"
......@@ -29,7 +30,7 @@ func TestNotificationService(t *testing.T) {
}
evalCtx := NewEvalContext(context.Background(), testRule)
notificationServiceScenario(t, "SendIfNeeded should render and upload image and send notification", evalCtx, true, func(scenarioCtx *scenarioContext) {
notificationServiceScenario(t, "Given alert rule with upload image enabled should render and upload image and send notification", evalCtx, true, func(scenarioCtx *scenarioContext) {
err := scenarioCtx.notificationService.SendIfNeeded(evalCtx)
require.NoError(t, err)
......@@ -38,12 +39,70 @@ func TestNotificationService(t *testing.T) {
require.Truef(t, evalCtx.Ctx.Value("notificationSent").(bool), "expected notification to be sent, but wasn't")
})
notificationServiceScenario(t, "SendIfNeeded should not render and upload image, but send notification", evalCtx, false, func(scenarioCtx *scenarioContext) {
notificationServiceScenario(t, "Given alert rule with upload image disabled should not render and upload image, but send notification", evalCtx, false, func(scenarioCtx *scenarioContext) {
err := scenarioCtx.notificationService.SendIfNeeded(evalCtx)
require.NoError(t, err)
require.Equalf(t, 0, scenarioCtx.renderCount, "expected render to be called, but wasn't")
require.Equalf(t, 0, scenarioCtx.imageUploadCount, "expected image to be uploaded, but wasn't")
require.Equalf(t, 0, scenarioCtx.renderCount, "expected render not to be called, but it was")
require.Equalf(t, 0, scenarioCtx.imageUploadCount, "expected image not to be uploaded, but it was")
require.Truef(t, evalCtx.Ctx.Value("notificationSent").(bool), "expected notification to be sent, but wasn't")
})
notificationServiceScenario(t, "Given alert rule with upload image enabled and render times out should send notification", evalCtx, true, func(scenarioCtx *scenarioContext) {
setting.AlertingNotificationTimeout = 200 * time.Millisecond
scenarioCtx.renderProvider = func(ctx context.Context, opts rendering.Opts) (*rendering.RenderResult, error) {
wait := make(chan bool)
go func() {
time.Sleep(1 * time.Second)
wait <- true
}()
select {
case <-ctx.Done():
if err := ctx.Err(); err != nil {
return nil, err
}
break
case <-wait:
}
return nil, nil
}
err := scenarioCtx.notificationService.SendIfNeeded(evalCtx)
require.NoError(t, err)
require.Equalf(t, 0, scenarioCtx.renderCount, "expected render not to be called, but it was")
require.Equalf(t, 0, scenarioCtx.imageUploadCount, "expected image not to be uploaded, but it was")
require.Truef(t, evalCtx.Ctx.Value("notificationSent").(bool), "expected notification to be sent, but wasn't")
})
notificationServiceScenario(t, "Given alert rule with upload image enabled and upload times out should send notification", evalCtx, true, func(scenarioCtx *scenarioContext) {
setting.AlertingNotificationTimeout = 200 * time.Millisecond
scenarioCtx.uploadProvider = func(ctx context.Context, path string) (string, error) {
wait := make(chan bool)
go func() {
time.Sleep(1 * time.Second)
wait <- true
}()
select {
case <-ctx.Done():
if err := ctx.Err(); err != nil {
return "", err
}
break
case <-wait:
}
return "", nil
}
err := scenarioCtx.notificationService.SendIfNeeded(evalCtx)
require.NoError(t, err)
require.Equalf(t, 1, scenarioCtx.renderCount, "expected render to be called, but wasn't")
require.Equalf(t, 0, scenarioCtx.imageUploadCount, "expected image not to be uploaded, but it was")
require.Truef(t, evalCtx.Ctx.Value("notificationSent").(bool), "expected notification to be sent, but wasn't")
})
}
......@@ -53,6 +112,8 @@ type scenarioContext struct {
notificationService *notificationService
imageUploadCount int
renderCount int
uploadProvider func(ctx context.Context, path string) (string, error)
renderProvider func(ctx context.Context, opts rendering.Opts) (*rendering.RenderResult, error)
}
type scenarioFunc func(c *scenarioContext)
......@@ -100,14 +161,26 @@ func notificationServiceScenario(t *testing.T, name string, evalCtx *EvalContext
return nil
})
setting.AlertingNotificationTimeout = 30 * time.Second
scenarioCtx := &scenarioContext{
evalCtx: evalCtx,
}
uploadProvider := func(ctx context.Context, path string) (string, error) {
scenarioCtx.imageUploadCount++
return "", nil
}
imageUploader := &testImageUploader{
uploadProvider: func(ctx context.Context, path string) (string, error) {
scenarioCtx.imageUploadCount++
return "", nil
if scenarioCtx.uploadProvider != nil {
if _, err := scenarioCtx.uploadProvider(ctx, path); err != nil {
return "", err
}
}
return uploadProvider(ctx, path)
},
}
......@@ -119,10 +192,20 @@ func notificationServiceScenario(t *testing.T, name string, evalCtx *EvalContext
newImageUploaderProvider = origNewImageUploaderProvider
}()
renderProvider := func(ctx context.Context, opts rendering.Opts) (*rendering.RenderResult, error) {
scenarioCtx.renderCount++
return &rendering.RenderResult{FilePath: "image.png"}, nil
}
renderService := &testRenderService{
renderProvider: func(ctx context.Context, opts rendering.Opts) (*rendering.RenderResult, error) {
scenarioCtx.renderCount++
return &rendering.RenderResult{FilePath: "image.png"}, nil
if scenarioCtx.renderProvider != nil {
if _, err := scenarioCtx.renderProvider(ctx, opts); err != nil {
return nil, err
}
}
return renderProvider(ctx, opts)
},
}
......
......@@ -60,6 +60,7 @@ func (rs *RenderingService) renderViaHttp(ctx context.Context, opts Opts) (*Rend
req.Header.Set("User-Agent", fmt.Sprintf("Grafana/%s", setting.BuildVersion))
// gives service some additional time to timeout and return possible errors.
reqContext, cancel := context.WithTimeout(ctx, opts.Timeout+time.Second*2)
defer cancel()
......
......@@ -63,6 +63,7 @@ func (rs *RenderingService) renderViaPhantomJS(ctx context.Context, opts Opts) (
cmdArgs = append([]string{fmt.Sprintf("--output-encoding=%s", opts.Encoding)}, cmdArgs...)
}
// gives phantomjs some additional time to timeout and return possible errors.
commandCtx, cancel := context.WithTimeout(ctx, opts.Timeout+time.Second*2)
defer cancel()
......
......@@ -3,6 +3,7 @@ package rendering
import (
"context"
"fmt"
"time"
pluginModel "github.com/grafana/grafana-plugin-model/go/renderer"
)
......@@ -22,7 +23,8 @@ func (rs *RenderingService) renderViaPlugin(ctx context.Context, opts Opts) (*Re
return nil, err
}
ctx, cancel := context.WithTimeout(ctx, opts.Timeout)
// gives plugin some additional time to timeout and return possible errors.
ctx, cancel := context.WithTimeout(ctx, opts.Timeout+time.Second*2)
defer cancel()
req := &pluginModel.RenderRequest{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment