Commit 6002df58 by Carl Bergquist Committed by GitHub

Add monitoring mixin for Grafana (#28285)

Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>
parent febdad4d
alerts.yaml
rules.yaml
dashboards_out
\ No newline at end of file
# Formatter invocation shared by `fmt` (rewrites in place) and `lint` (diff only).
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s

# These targets are commands, not files; declare them phony so a stray file or
# directory with the same name (e.g. `build`) cannot make them a no-op.
.PHONY: all fmt lint build clean

all: fmt lint build clean

# Reformat every .libsonnet/.jsonnet file outside vendor/ in place.
fmt:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

# Fail if any file differs from its formatted form, then lint the mixin.
# A `while` loop exits with the status of its last iteration only, so failures
# are accumulated in `rc`; otherwise an unformatted file would go unnoticed
# unless it happened to be the last one checked.
lint:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		{ rc=0; while read -r f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" - || rc=1; \
		done; exit $$rc; }
	mixtool lint mixin.libsonnet

# Generate the alert/rule files and the dashboards from the mixin.
build:
	mixtool generate all mixin.libsonnet

# Remove everything `build` generates.
clean:
	rm -rf dashboards_out alerts.yaml rules.yaml
\ No newline at end of file
# Grafana Mixin
_This is a work in progress. We aim for it to become a good role model for alerts
and dashboards eventually, but it is not quite there yet._
The Grafana Mixin is a set of configurable, reusable, and extensible alerts and
dashboards based on the metrics exported by Grafana. The mixin creates
recording and alerting rules for Prometheus and suitable dashboard descriptions
for Grafana.
To use them, you need to have `mixtool` and `jsonnetfmt` installed. If you
have a working Go development environment, it's easiest to run the following:
```bash
$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool
$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt
```
You can then build the Prometheus rules files `alerts.yaml` and
`rules.yaml` and a directory `dashboards_out` with the JSON dashboard files
for Grafana:
```bash
$ make build
```
For more advanced uses of mixins, see
https://github.com/monitoring-mixins/docs.
groups:
  - name: GrafanaAlerts
    rules:
      # Percentage of requests per (namespace, job, handler) answered with a
      # 5xx status, ignoring the data-proxy and query handlers listed in the
      # matcher.
      #
      # Both sides aggregate `statuscode` away. Without that, the division
      # matches series one-to-one on every label, so each 5xx series is
      # divided by itself and the result is always 100.
      #
      # NOTE(review): because of the `100 *`, the threshold is in percent, so
      # `> 0.5` means "more than 0.5% errors" - confirm this is the intended
      # sensitivity for a critical alert.
      - alert: GrafanaRequestsFailing
        for: 5m
        expr: |
          100 * sum without (statuscode) (namespace_job_handler_statuscode:http_request_total:rate5m{handler!~"/datasources/proxy/:id.*|/ds/query|/tsdb/query", statuscode=~"5.."})
          /
          sum without (statuscode) (namespace_job_handler_statuscode:http_request_total:rate5m{handler!~"/datasources/proxy/:id.*|/ds/query|/tsdb/query"})
          > 0.5
        labels:
          severity: 'critical'
        annotations:
          message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors"
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 35,
"iteration": 1602761142538,
"links": [],
"panels": [
{
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"noValue": "0",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 0
},
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
}
},
"pluginVersion": "7.0.4",
"targets": [
{
"expr": "grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Firing Alerts",
"type": "stat"
},
{
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 6,
"y": 0
},
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
}
},
"pluginVersion": "7.0.4",
"targets": [
{
"expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Dashboards",
"type": "stat"
},
{
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {
"align": null
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 12,
"x": 12,
"y": 0
},
"id": 10,
"options": {
"showHeader": true
},
"pluginVersion": "7.0.4",
"targets": [
{
"expr": "grafana_build_info{job=~\"$job\", instance=~\"$instance\"}",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Build Info",
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Value": true,
"branch": true,
"container": true,
"goversion": true,
"namespace": true,
"pod": true,
"revision": true
},
"indexByName": {
"Time": 7,
"Value": 11,
"branch": 4,
"container": 8,
"edition": 2,
"goversion": 6,
"instance": 1,
"job": 0,
"namespace": 9,
"pod": 10,
"revision": 5,
"version": 3
},
"renameByName": {}
}
}
],
"type": "table"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 5
},
"hiddenSeries": false,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (statuscode) (irate(http_request_total{job=~\"$job\", instance=~\"$instance\"}[1m])) ",
"interval": "",
"legendFormat": "{{statuscode}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "RPS",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:157",
"format": "reqps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:158",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 5
},
"hiddenSeries": false,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "max(http_request_duration_milliseconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.99\"})",
"interval": "",
"legendFormat": "max-99th",
"refId": "A"
},
{
"expr": "max(http_request_duration_milliseconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.9\"})",
"interval": "",
"legendFormat": "max-90th",
"refId": "B"
},
{
"expr": "sum(irate(http_request_duration_milliseconds_sum{job=~\"$job\", instance=~\"$instance\"}[$__interval])) / sum(irate(http_request_duration_milliseconds_count{job=~\"$job\", instance=~\"$instance\"}[$__interval])) ",
"interval": "",
"legendFormat": "avg",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Request Latency",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:210",
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:211",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"schemaVersion": 25,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "prometheus",
"value": "prometheus"
},
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": ".*",
"current": {
"selected": true,
"tags": [],
"text": "All",
"value": [
"$__all"
]
},
"datasource": "$datasource",
"definition": "label_values(grafana_build_info, job)",
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "job",
"options": [],
"query": "label_values(grafana_build_info, job)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"definition": "label_values(grafana_build_info, instance)",
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "instance",
"options": [],
"query": "label_values(grafana_build_info, instance)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Grafana Overview",
"uid": "6be0s85Mk",
"version": 4
}
\ No newline at end of file
{
  // Dashboards shipped by this mixin, keyed by output file name. `+:` merges
  // into dashboards already defined by whatever this mixin is combined with,
  // instead of replacing them - the same approach used for the rules and
  // alerts below.
  grafanaDashboards+: {
    'grafana-overview.json': (import 'dashboards/grafana-overview.json'),
  },

  // Helper function to ensure that we don't override other rules, by forcing
  // the patching of the groups list, and not the overall rules object.
  // `parseYaml` is a native function supplied by the host tool; element [0]
  // of its result is used (presumably the first YAML document - confirm).
  local importRules(rules) = {
    groups+: std.native('parseYaml')(rules)[0].groups,
  },

  prometheusRules+: importRules(importstr 'rules/rules.yaml'),
  prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'),
}
groups:
  - name: grafana_rules
    rules:
      # 5-minute request rate of http_request_total, summed per
      # (namespace, job, handler, statuscode). No handlers are filtered out
      # here; consumers of this series apply their own handler matchers.
      - record: namespace_job_handler_statuscode:http_request_total:rate5m
        expr: |
          sum by (namespace, job, handler, statuscode) (rate(http_request_total[5m]))
......@@ -224,6 +224,7 @@ def lint_backend_step(edition):
'revive -formatter stylish -config scripts/go/configs/revive.toml ./pkg/...',
'./scripts/revive-strict',
'./scripts/tidy-check.sh',
# Runs the mixin check script alongside the other backend lint commands.
'./scripts/mixin-check.sh',
],
}
......
#!/bin/bash
# Installs mixtool and jsonnetfmt, then runs the `lint` and `build` make
# targets inside grafana-mixin/. Any failing command aborts the script.
set -e -o pipefail

cd grafana-mixin

# Install the Go tools one after another; `set -e` stops at the first failure.
for tool in \
  github.com/monitoring-mixins/mixtool/cmd/mixtool \
  github.com/google/go-jsonnet/cmd/jsonnetfmt; do
  go install "${tool}"
done

make lint build
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment