Commit f6b8d3a1 by Marcus Efraimsson

devenv: grafana high availability (ha) test setup

parent 464f3f73
grafana/provisioning/dashboards/alerts/alert-*
\ No newline at end of file
# Grafana High Availability (HA) test setup
A set of docker compose services which together creates a Grafana HA test setup with capability of easily
scaling up/down number of Grafana instances.
Included services
* Grafana
* Mysql - Grafana configuration database and session storage
* Prometheus - Monitoring of Grafana and used as datasource of provisioned alert rules
* Nginx - Reverse proxy for Grafana and Prometheus. Enables browsing Grafana/Prometheus UI using a hostname
## Prerequisites
### Build grafana docker container
Build a Grafana docker container from current branch and commit and tag it as grafana/grafana:dev.
```bash
$ cd <grafana repo>
$ make build-docker-full
```
### Virtual host names
#### Alternative 1 - Use dnsmasq
```bash
$ sudo apt-get install dnsmasq
$ echo 'address=/loc/127.0.0.1' | sudo tee /etc/dnsmasq.d/dnsmasq-loc.conf > /dev/null
$ sudo /etc/init.d/dnsmasq restart
$ ping whatever.loc
PING whatever.loc (127.0.0.1) 56(84) bytes of data.
64 bytes from localhost (127.0.0.1): icmp_seq=1 ttl=64 time=0.076 ms
--- whatever.loc ping statistics ---
1 packet transmitted, 1 received, 0% packet loss, time 1998ms
```
#### Alternative 2 - Manually update /etc/hosts
Update your `/etc/hosts` to be able to access Grafana and/or Prometheus UI using a hostname.
```bash
$ cat /etc/hosts
127.0.0.1 grafana.loc
127.0.0.1 prometheus.loc
```
## Start services
```bash
$ docker-compose up -d
```
Browse
* http://grafana.loc/
* http://prometheus.loc/
Check for any errors
```bash
$ docker-compose logs | grep error
```
### Scale Grafana instances up/down
Scale number of Grafana instances to `<instances>`
```bash
$ docker-compose up --scale grafana=<instances> -d
# for example 3 instances
$ docker-compose up --scale grafana=3 -d
```
## Test alerting
### Create notification channels
Creates default notification channels, if not already exists
```bash
$ ./alerts.sh setup
```
### Slack notifications
Disable
```bash
$ ./alerts.sh slack -d
```
Enable and configure url
```bash
$ ./alerts.sh slack -u https://hooks.slack.com/services/...
```
Enable, configure url and enable reminders
```bash
$ ./alerts.sh slack -u https://hooks.slack.com/services/... -r -e 10m
```
### Provision alert dashboards with alert rules
Provision 1 dashboard/alert rule (default)
```bash
$ ./alerts.sh provision
```
Provision 10 dashboards/alert rules
```bash
$ ./alerts.sh provision -a 10
```
Provision 10 dashboards/alert rules and change condition to `gt > 100`
```bash
$ ./alerts.sh provision -a 10 -c 100
```
### Pause/unpause all alert rules
Pause
```bash
$ ./alerts.sh pause
```
Unpause
```bash
$ ./alerts.sh unpause
```
#!/bin/bash
requiresJsonnet() {
if ! type "jsonnet" > /dev/null; then
echo "you need you install jsonnet to run this script"
echo "follow the instructions on https://github.com/google/jsonnet"
exit 1
fi
}
setup() {
STATUS=$(curl -s -o /dev/null -w '%{http_code}' http://admin:admin@grafana.loc/api/alert-notifications/1)
if [ $STATUS -eq 200 ]; then
echo "Email already exists, skipping..."
else
curl -H "Content-Type: application/json" \
-d '{
"name": "Email",
"type": "email",
"isDefault": false,
"sendReminder": false,
"uploadImage": true,
"settings": {
"addresses": "user@test.com"
}
}' \
http://admin:admin@grafana.loc/api/alert-notifications
fi
STATUS=$(curl -s -o /dev/null -w '%{http_code}' http://admin:admin@grafana.loc/api/alert-notifications/2)
if [ $STATUS -eq 200 ]; then
echo "Slack already exists, skipping..."
else
curl -H "Content-Type: application/json" \
-d '{
"name": "Slack",
"type": "slack",
"isDefault": false,
"sendReminder": false,
"uploadImage": true
}' \
http://admin:admin@grafana.loc/api/alert-notifications
fi
}
slack() {
enabled=true
url=''
remind=false
remindEvery='10m'
while getopts ":e:u:dr" o; do
case "${o}" in
e)
remindEvery=${OPTARG}
;;
u)
url=${OPTARG}
;;
d)
enabled=false
;;
r)
remind=true
;;
esac
done
shift $((OPTIND-1))
curl -X PUT \
-H "Content-Type: application/json" \
-d '{
"id": 2,
"name": "Slack",
"type": "slack",
"isDefault": '$enabled',
"sendReminder": '$remind',
"frequency": "'$remindEvery'",
"uploadImage": true,
"settings": {
"url": "'$url'"
}
}' \
http://admin:admin@grafana.loc/api/alert-notifications/2
}
provision() {
alerts=1
condition=65
while getopts ":a:c:" o; do
case "${o}" in
a)
alerts=${OPTARG}
;;
c)
condition=${OPTARG}
;;
esac
done
shift $((OPTIND-1))
requiresJsonnet
rm -rf grafana/provisioning/dashboards/alerts/alert-*.json
jsonnet -m grafana/provisioning/dashboards/alerts grafana/provisioning/alerts.jsonnet --ext-code alerts=$alerts --ext-code condition=$condition
}
pause() {
curl -H "Content-Type: application/json" \
-d '{"paused":true}' \
http://admin:admin@grafana.loc/api/admin/pause-all-alerts
}
unpause() {
curl -H "Content-Type: application/json" \
-d '{"paused":false}' \
http://admin:admin@grafana.loc/api/admin/pause-all-alerts
}
usage() {
echo -e "Usage: ./alerts.sh COMMAND [OPTIONS]\n"
echo -e "Commands"
echo -e " setup\t\t creates default alert notification channels"
echo -e " slack\t\t configure slack notification channel"
echo -e " [-d]\t\t\t disable notifier, default enabled"
echo -e " [-u]\t\t\t url"
echo -e " [-r]\t\t\t send reminders"
echo -e " [-e <remind every>]\t\t default 10m\n"
echo -e " provision\t provision alerts"
echo -e " [-a <alert rule count>]\t default 1"
echo -e " [-c <condition value>]\t default 65\n"
echo -e " pause\t\t pause all alerts"
echo -e " unpause\t unpause all alerts"
}
main() {
local cmd=$1
if [[ $cmd == "setup" ]]; then
setup
elif [[ $cmd == "slack" ]]; then
slack "${@:2}"
elif [[ $cmd == "provision" ]]; then
provision "${@:2}"
elif [[ $cmd == "pause" ]]; then
pause
elif [[ $cmd == "unpause" ]]; then
unpause
fi
if [[ -z "$cmd" ]]; then
usage
fi
}
main "$@"
version: "2.1"
services:
nginx-proxy:
image: jwilder/nginx-proxy
ports:
- "80:80"
volumes:
- /var/run/docker.sock:/tmp/docker.sock:ro
mysql:
image: mysql
environment:
MYSQL_ROOT_PASSWORD: rootpass
MYSQL_DATABASE: grafana
MYSQL_USER: grafana
MYSQL_PASSWORD: password
healthcheck:
test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"]
timeout: 10s
retries: 10
grafana:
image: grafana/grafana:dev
volumes:
- ./grafana/provisioning/:/etc/grafana/provisioning/
environment:
- VIRTUAL_HOST=grafana.loc
- GF_SERVER_ROOT_URL=http://grafana.loc
- GF_DATABASE_TYPE=mysql
- GF_DATABASE_HOST=mysql:3306
- GF_DATABASE_NAME=grafana
- GF_DATABASE_USER=grafana
- GF_DATABASE_PASSWORD=password
- GF_SESSION_PROVIDER=mysql
- GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(mysql:3306)/grafana?allowNativePasswords=true
ports:
- 3000
depends_on:
mysql:
condition: service_healthy
prometheus:
image: prom/prometheus:v2.4.2
volumes:
- ./prometheus/:/etc/prometheus/
environment:
- VIRTUAL_HOST=prometheus.loc
ports:
- 9090
# mysqld-exporter:
# image: prom/mysqld-exporter
# environment:
# - DATA_SOURCE_NAME=grafana:password@(mysql:3306)/
# ports:
# - 9104
\ No newline at end of file
local numAlerts = std.extVar('alerts');
local condition = std.extVar('condition');
local arr = std.range(1, numAlerts);
local alertDashboardTemplate = {
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"alert": {
"conditions": [
{
"evaluator": {
"params": [
65
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A",
"5m",
"now"
]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"frequency": "10s",
"handler": 1,
"name": "bulk alerting",
"noDataState": "no_data",
"notifications": [
{
"id": 2
}
]
},
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 0
},
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"$$hashKey": "object:117",
"expr": "go_goroutines",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": [
{
"colorMode": "critical",
"fill": true,
"line": true,
"op": "gt",
"value": 50
}
],
"timeFrom": null,
"timeShift": null,
"title": "Panel Title",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"schemaVersion": 16,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "New dashboard",
"uid": null,
"version": 0
};
{
['alert-' + std.toString(x) + '.json']:
alertDashboardTemplate + {
panels: [
alertDashboardTemplate.panels[0] +
{
alert+: {
name: 'Alert rule ' + x,
conditions: [
alertDashboardTemplate.panels[0].alert.conditions[0] +
{
evaluator+: {
params: [condition]
}
},
],
},
},
],
uid: 'alert-' + x,
title: 'Alert ' + x
},
for x in arr
}
\ No newline at end of file
apiVersion: 1
providers:
- name: 'Alerts'
folder: 'Alerts'
type: file
options:
path: /etc/grafana/provisioning/dashboards/alerts
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"links": [],
"panels": [
{
"aliasColors": {
"Active alerts": "#bf1b00"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"gridPos": {
"h": 12,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "Active grafana instances",
"dashes": true,
"fill": 0
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(increase(grafana_alerting_notification_sent_total[1m])) by(job)",
"format": "time_series",
"instant": false,
"interval": "1m",
"intervalFactor": 1,
"legendFormat": "Notifications sent",
"refId": "A"
},
{
"expr": "min(grafana_alerting_active_alerts) without(instance)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 1,
"legendFormat": "Active alerts",
"refId": "B"
},
{
"expr": "count(up{job=\"grafana\"})",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Active grafana instances",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Notifications sent vs active alerts",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": 3
}
}
],
"schemaVersion": 16,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Overview",
"uid": "xHy7-hAik",
"version": 6
}
\ No newline at end of file
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
jsonData:
timeInterval: 10s
queryTimeout: 30s
httpMethod: POST
\ No newline at end of file
# my global config
global:
scrape_interval: 10s # By default, scrape targets every 15 seconds.
evaluation_interval: 10s # By default, scrape targets every 15 seconds.
# scrape_timeout is set to the global default (10s).
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
#rule_files:
# - "alert.rules"
# - "first.rules"
# - "second.rules"
# alerting:
# alertmanagers:
# - scheme: http
# static_configs:
# - targets:
# - "127.0.0.1:9093"
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'grafana'
dns_sd_configs:
- names:
- 'grafana'
type: 'A'
port: 3000
refresh_interval: 10s
# - job_name: 'mysql'
# dns_sd_configs:
# - names:
# - 'mysqld-exporter'
# type: 'A'
# port: 9104
# refresh_interval: 10s
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment