Grafana

The most popular open-source monitoring and visualization platform. Features dynamic dashboards, machine learning integration, and diverse data source support. Recognized as a Gartner Magic Quadrant Leader.

Monitoring Server · Visualization Platform · Dashboard · Analytics · Open Source · Time Series · Multi-tenancy

Server

Grafana

Overview

Grafana is the most popular open-source monitoring and visualization platform. It features dynamic dashboards, machine learning integration, and diverse data source support, and it has been recognized as a Gartner Magic Quadrant Leader. With an overwhelming 94% adoption rate and a high rating of 4.6/5.0, it continues to evolve with AI-powered insights and observability as code.

Details

Grafana maintains an overwhelming market share with a 94% adoption rate and has been recognized as a Gartner Magic Quadrant Leader with a high rating of 4.6/5.0. It continues evolving with AI-powered insights and observability as code capabilities. The platform supports over 150 data source integrations and serves millions of users worldwide. As both an open-source project and commercial offering, Grafana provides enterprise-grade features while maintaining its community-driven development approach.

Key Technical Features

Dynamic Dashboards: Interactive and customizable visualization panels
150+ Data Sources: Support for diverse databases, APIs, and monitoring systems
AI-Powered Insights: Machine learning integration for anomaly detection
Multi-tenancy: Organization and team-based access control
Alerting System: Flexible alerting with multiple notification channels
Plugin Ecosystem: Extensive plugin architecture for extensibility

Use Cases

Infrastructure and application monitoring
Business intelligence and analytics
IoT data visualization
Security monitoring and compliance
Performance analysis and optimization
Real-time operational dashboards

Pros and Cons

Pros

Market Leader: Industry-standard visualization platform
Rich Data Source Support: 150+ integrations out of the box
Active Community: Large community with extensive resources
Flexible Dashboards: Highly customizable visualization options
Enterprise Features: Advanced authentication, provisioning, and scaling
Cloud and On-premise: Flexible deployment options

Cons

Learning Curve: Complex configuration for advanced features
Resource Usage: Can be memory-intensive with large datasets
Dashboard Sprawl: Risk of unmanaged dashboard proliferation
Query Complexity: Requires knowledge of data source query languages
Version Compatibility: Breaking changes between major versions
Plugin Dependencies: Third-party plugin reliability concerns

Reference Pages

Code Examples

Installation and Basic Setup

# Docker installation
# The admin password comes from the caller's environment instead of being
# hard-coded; `${VAR:?msg}` makes the shell abort if it is unset or empty.
docker run -d \
  --name grafana \
  -p 3000:3000 \
  -e "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD first}" \
  grafana/grafana:latest

# Using Docker Compose
# The heredoc delimiter is quoted so the shell writes the file verbatim;
# Compose (not the shell) resolves ${GRAFANA_ADMIN_PASSWORD} when the stack
# starts, which keeps the secret out of docker-compose.yml.
# ./grafana.ini must exist before starting the stack, otherwise Docker
# creates a directory at that path and the bind mount is useless.
cat > docker-compose.yml << 'EOF'
version: '3.8'
services:
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD first}
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-worldmap-panel
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana.ini:/etc/grafana/grafana.ini
    restart: unless-stopped

volumes:
  grafana-data:
EOF

# Package installation (Ubuntu/Debian)
sudo apt-get install -y adduser libfontconfig1
wget https://dl.grafana.com/enterprise/release/grafana-enterprise_10.2.0_amd64.deb
# Installing the local file through apt (note the ./ prefix) also pulls in any
# dependency that is still missing; plain `dpkg -i` would stop with an error.
sudo apt-get install -y ./grafana-enterprise_10.2.0_amd64.deb

# Start service
# Reload unit files first so systemd sees the freshly installed service.
sudo systemctl daemon-reload
sudo systemctl enable grafana-server
sudo systemctl start grafana-server

# Check status
sudo systemctl status grafana-server

Configuration File

# grafana.ini
# instance_name is a top-level key: it belongs before the first [section]
# header (as in Grafana's shipped defaults.ini), not inside a "[default]"
# section.
instance_name = grafana-production

# Filesystem locations for data, logs, plugins and provisioning files.
[paths]
data = /var/lib/grafana
logs = /var/log/grafana
plugins = /var/lib/grafana/plugins
provisioning = /etc/grafana/provisioning

# HTTP listener and public URL.
# NOTE(review): protocol is http while cert_file/cert_key are set and root_url
# is https — presumably TLS is terminated by a reverse proxy in front of
# Grafana; confirm, or switch protocol to https.
[server]
protocol = http
http_addr = 0.0.0.0
http_port = 3000
domain = grafana.example.com
enforce_domain = false
root_url = https://grafana.example.com/
serve_from_sub_path = false
static_root_path = public
enable_gzip = true
cert_file = /etc/ssl/certs/grafana.crt
cert_key = /etc/ssl/private/grafana.key

# Backing database: PostgreSQL over a TLS-required connection.
# password is a placeholder and must be replaced before use.
[database]
type = postgres
host = postgres.example.com:5432
name = grafana
user = grafana
password = your_password_here
ssl_mode = require
max_open_conn = 300
max_idle_conn = 300
conn_max_lifetime = 14400
log_queries = false

# Admin credentials, cookie policy and proxy allow-list.
[security]
admin_user = admin
# Resolved from the GF_SECURITY_ADMIN_PASSWORD environment variable.
admin_password = $__env{GF_SECURITY_ADMIN_PASSWORD}
# Placeholder — replace with a real secret before use.
secret_key = your_secret_key_here
login_remember_days = 7
cookie_username = grafana_user
cookie_remember_name = grafana_remember
disable_gravatar = true
# host:port pairs, comma separated.
data_source_proxy_whitelist = prometheus.example.com:9090,elasticsearch.example.com:9200
disable_brute_force_login_protection = false
cookie_secure = true
cookie_samesite = strict
allow_embedding = false
strict_transport_security = true

# Login form and sign-out behaviour.
[auth]
disable_login_form = false
disable_signout_menu = false
signout_redirect_url = https://example.com/logout
oauth_auto_login = false
# 86400 seconds = 24 hours.
api_key_max_seconds_to_live = 86400

# LDAP login; connection details live in the referenced TOML file.
[auth.ldap]
enabled = true
config_file = /etc/grafana/ldap.toml
allow_sign_up = false

# Outgoing mail settings.
[smtp]
enabled = true
host = smtp.example.com:587
# Placeholder mailbox. The previous value was an e-mail-obfuscation artifact
# ("[email protected]"), not a usable address.
user = grafana@example.com
password = your_smtp_password
cert_file = 
key_file = 
skip_verify = false
# Must be a syntactically valid address for mail to be accepted.
from_address = grafana@example.com
from_name = Grafana
ehlo_identity = grafana.example.com
startTLS_policy = MandatoryStartTLS

# Legacy (dashboard) alerting.
# Kept disabled because [unified_alerting] is enabled in this file: the two
# alerting engines are mutually exclusive, and Grafana refuses the
# configuration when both are switched on.
[alerting]
enabled = false
execute_alerts = true
error_or_timeout = alerting
nodata_or_nullvalues = no_data
concurrent_render_limit = 5
evaluation_timeout_seconds = 30
notification_timeout_seconds = 30
max_attempts = 3

# Unified alerting, including the high-availability listener and gossip settings.
# ha_peers is empty here, so no peer instances are configured.
[unified_alerting]
enabled = true
disabled_orgs = 
admin_config_poll_interval = 60s
alertmanager_config_poll_interval = 60s
ha_listen_address = "0.0.0.0:9094"
ha_advertise_address = ""
ha_peers = ""
ha_peer_timeout = 15s
ha_gossip_interval = 200ms
ha_push_pull_interval = 60s

# Internal metrics, protected with basic auth.
# basic_auth_password is a placeholder.
[metrics]
enabled = true
interval_seconds = 10
disable_total_stats = false
basic_auth_username = metrics
basic_auth_password = your_metrics_password

[grafana_net]
url = https://grafana.net

# Logging: two outputs (console and file), each configured in its own
# sub-section below.
[log]
mode = console file
level = info
filters = rendering:debug

[log.console]
level = info
format = console

# File output with rotation enabled and files kept for 7 days.
[log.file]
level = info
format = text
log_rotate = true
max_lines = 1000000
max_size_shift = 28
daily_rotate = true
max_days = 7

# Per-organisation, per-user and global limits.
[quota]
enabled = true
org_user = 100
org_dashboard = 1000
org_data_source = 100
org_api_key = 100
user_org = 10
global_user = 1000
global_org = 100
global_dashboard = 10000
global_api_key = 1000
global_session = 10000

# Comma-separated list of feature flags to switch on.
[feature_toggles]
enable = publicDashboards,lokiExperimentalStreaming,correlations

Data Source Provisioning

# provisioning/datasources/datasources.yml
# Data sources declared here are created from this file; each list entry is
# one data source.
apiVersion: 1

datasources:
  # Default data source; editable: false locks it against UI edits.
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
    jsonData:
      timeInterval: 30s
      queryTimeout: 60s
      httpMethod: POST
      manageAlerts: true
      prometheusType: Prometheus
      prometheusVersion: 2.40.0
      cacheLevel: High
      incrementalQuerying: true
      disableRecordingRules: false

  # Elasticsearch logs data source.
  - name: Elasticsearch
    type: elasticsearch
    access: proxy
    url: http://elasticsearch:9200
    # With `interval: Daily` the index name is a date pattern: the literal
    # prefix goes in square brackets, followed by the date format. A wildcard
    # such as logstash-* only fits when no interval is set.
    database: "[logstash-]YYYY.MM.DD"
    jsonData:
      esVersion: 70
      timeField: "@timestamp"
      logMessageField: message
      logLevelField: level
      interval: Daily
      maxConcurrentShardRequests: 5
      includeFrozen: false

  # PostgreSQL data source for the "monitoring" database.
  - name: PostgreSQL
    type: postgres
    url: postgres:5432
    database: monitoring
    user: grafana
    # Placeholder credential — replace before use.
    secureJsonData:
      password: your_password_here
    jsonData:
      sslmode: require
      maxOpenConns: 100
      maxIdleConns: 100
      connMaxLifetime: 14400
      # presumably encodes PostgreSQL 13 — confirm against the data source docs
      postgresVersion: 1300
      timescaledb: false

  # InfluxDB data source with a password and a custom Authorization header.
  - name: InfluxDB
    type: influxdb
    access: proxy
    url: http://influxdb:8086
    database: telegraf
    user: grafana
    jsonData:
      httpMode: GET
      httpHeaderName1: Authorization
    # All secrets live in ONE secureJsonData mapping. Repeating the key is a
    # duplicate mapping key in YAML: parsers either reject the file or keep
    # only the last occurrence, which silently drops the password.
    secureJsonData:
      password: your_password_here
      httpHeaderValue1: Token your_token_here

  # CloudWatch data source authenticated with a static access/secret key pair.
  - name: CloudWatch
    type: cloudwatch
    jsonData:
      # `keys` selects the accessKey/secretKey pair below; `credentials`
      # would read a shared credentials file instead and ignore them.
      authType: keys
      defaultRegion: us-east-1
      customMetricsNamespaces: AWS/ApplicationELB,AWS/ELB,AWS/Lambda
      assumeRoleArn: arn:aws:iam::123456789012:role/GrafanaCloudWatchRole
    # AWS documentation example keys — replace with real credentials.
    secureJsonData:
      accessKey: AKIAIOSFODNN7EXAMPLE
      secretKey: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY

  # Loki data source with one derived field that extracts a trace ID from
  # log lines.
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    jsonData:
      maxLines: 1000
      derivedFields:
        # Links the captured value to the data source with UID jaeger_uid.
        - datasourceUid: jaeger_uid
          # Captures the word characters that follow "trace_id=".
          matcherRegex: "trace_id=(\\w+)"
          name: TraceID
          # presumably `$$` escapes `$` so provisioning does not treat
          # ${__value.raw} as an environment variable — confirm
          url: "$${__value.raw}"

Dashboard Provisioning

# provisioning/dashboards/dashboards.yml
# Three file-based dashboard providers, each reading JSON dashboards from a
# directory and placing them in the named folder.
apiVersion: 1

providers:
  # NOTE(review): this provider's path is the parent directory of the two
  # providers below. If the file provider scans sub-directories, dashboards in
  # infrastructure/ and applications/ would be loaded twice — confirm, or give
  # this provider its own directory.
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    editable: true
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards

  # Dashboards shown under the "Infrastructure" folder.
  - name: 'infrastructure'
    orgId: 1
    folder: 'Infrastructure'
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /etc/grafana/provisioning/dashboards/infrastructure

  # Dashboards shown under the "Applications" folder.
  - name: 'applications'
    orgId: 1
    folder: 'Applications'
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /etc/grafana/provisioning/dashboards/applications

Advanced Dashboard JSON

{
  "dashboard": {
    "id": null,
    "title": "Infrastructure Overview",
    "tags": ["infrastructure", "monitoring"],
    "timezone": "browser",
    "refresh": "30s",
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "timepicker": {
      "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
      "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
    },
    "templating": {
      "list": [
        {
          "name": "instance",
          "type": "query",
          "datasource": "Prometheus",
          "query": "label_values(up, instance)",
          "refresh": 1,
          "sort": 1,
          "multi": true,
          "includeAll": true,
          "allValue": ".*"
        },
        {
          "name": "job",
          "type": "query",
          "datasource": "Prometheus",
          "query": "label_values(up, job)",
          "refresh": 1,
          "sort": 1,
          "multi": true,
          "includeAll": true
        }
      ]
    },
    "panels": [
      {
        "id": 1,
        "title": "CPU Usage",
        "type": "timeseries",
        "targets": [
          {
            "expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$instance\"}[5m])) * 100)",
            "legendFormat": "CPU Usage %",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "color": {"mode": "palette-classic"},
            "custom": {
              "axisLabel": "",
              "axisPlacement": "auto",
              "barAlignment": 0,
              "drawStyle": "line",
              "fillOpacity": 10,
              "gradientMode": "none",
              "hideFrom": {"legend": false, "tooltip": false, "vis": false},
              "lineInterpolation": "linear",
              "lineWidth": 1,
              "pointSize": 5,
              "scaleDistribution": {"type": "linear"},
              "showPoints": "never",
              "spanNulls": false,
              "stacking": {"group": "A", "mode": "none"},
              "thresholdsStyle": {"mode": "off"}
            },
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 70},
                {"color": "red", "value": 90}
              ]
            }
          }
        },
        "options": {
          "legend": {"calcs": [], "displayMode": "list", "placement": "bottom"},
          "tooltip": {"mode": "single", "sort": "none"}
        },
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
      },
      {
        "id": 2,
        "title": "Memory Usage",
        "type": "timeseries",
        "targets": [
          {
            "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$instance\"} / node_memory_MemTotal_bytes{instance=~\"$instance\"})) * 100",
            "legendFormat": "Memory Usage %",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 80},
                {"color": "red", "value": 95}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
      },
      {
        "id": 3,
        "title": "Network Traffic",
        "type": "timeseries",
        "targets": [
          {
            "expr": "irate(node_network_receive_bytes_total{instance=~\"$instance\",device!=\"lo\"}[5m]) * 8",
            "legendFormat": "{{device}} - Receive",
            "refId": "A"
          },
          {
            "expr": "irate(node_network_transmit_bytes_total{instance=~\"$instance\",device!=\"lo\"}[5m]) * 8",
            "legendFormat": "{{device}} - Transmit",
            "refId": "B"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "bps",
            "custom": {"drawStyle": "line", "lineInterpolation": "linear", "spanNulls": false}
          }
        },
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}
      }
    ],
    "annotations": {
      "list": [
        {
          "name": "Deployment",
          "datasource": "Prometheus",
          "enable": true,
          "expr": "increase(deployment_version_change_total[1m])",
          "iconColor": "rgba(255, 96, 96, 1)",
          "tags": ["deployment"]
        }
      ]
    }
  },
  "overwrite": true
}

API Usage and Automation

# grafana_api.py
import requests
import json
from datetime import datetime, timedelta

class GrafanaAPI:
    """Small convenience wrapper around a handful of Grafana HTTP API endpoints.

    Every call sends the bearer token supplied at construction time and
    returns the decoded JSON body of the response. HTTP error statuses are
    not raised, so callers must inspect the returned payload themselves.
    """

    def __init__(self, base_url, api_key, timeout=30):
        """Store connection settings.

        Args:
            base_url: Root URL of the Grafana instance; a trailing slash is
                stripped so paths can be appended directly.
            api_key: Token sent as ``Authorization: Bearer <api_key>``.
            timeout: Seconds passed to ``requests`` as the request timeout.
                Without one, a stalled server blocks the caller forever.
        """
        self.base_url = base_url.rstrip('/')
        self.headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }
        self.timeout = timeout

    def _request(self, method, path, **kwargs):
        """Send one request to ``base_url + path`` with the shared headers and timeout."""
        kwargs.setdefault('timeout', self.timeout)
        return requests.request(
            method, f"{self.base_url}{path}", headers=self.headers, **kwargs
        )

    def create_dashboard(self, dashboard_json):
        """Create a new dashboard by POSTing the payload to /api/dashboards/db."""
        return self._request('POST', '/api/dashboards/db', json=dashboard_json).json()

    def get_dashboard(self, uid):
        """Get dashboard by UID"""
        return self._request('GET', f'/api/dashboards/uid/{uid}').json()

    def update_dashboard(self, dashboard_json):
        """Update existing dashboard.

        This posts to the same endpoint as :meth:`create_dashboard`, so it
        simply delegates instead of duplicating the request code.
        """
        return self.create_dashboard(dashboard_json)

    def delete_dashboard(self, uid):
        """Delete dashboard by UID; returns True only for an HTTP 200 response."""
        response = self._request('DELETE', f'/api/dashboards/uid/{uid}')
        return response.status_code == 200

    def search_dashboards(self, query=None, tags=None):
        """Search dashboards.

        Args:
            query: Optional search string, sent as the ``query`` parameter.
            tags: Optional tag or list of tags, sent as the ``tag`` parameter.
        """
        params = {}
        if query:
            params['query'] = query
        if tags:
            params['tag'] = tags

        return self._request('GET', '/api/search', params=params).json()

    def create_data_source(self, data_source_config):
        """Create a new data source"""
        return self._request('POST', '/api/datasources', json=data_source_config).json()

    def get_data_sources(self):
        """Get all data sources"""
        return self._request('GET', '/api/datasources').json()

    def create_user(self, user_data):
        """Create a new user"""
        return self._request('POST', '/api/admin/users', json=user_data).json()

    def create_organization(self, org_data):
        """Create a new organization"""
        return self._request('POST', '/api/orgs', json=org_data).json()

    def export_dashboard(self, uid, file_path):
        """Export dashboard to JSON file.

        Returns:
            True when the fetched payload contained a ``dashboard`` entry and
            it was written to ``file_path``; False otherwise.
        """
        dashboard = self.get_dashboard(uid)

        if 'dashboard' in dashboard:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(dashboard['dashboard'], f, indent=2)
            return True
        return False

    def backup_all_dashboards(self, backup_dir):
        """Backup all dashboards returned by the search endpoint into ``backup_dir``.

        One ``<title>_<uid>.json`` file is written per search hit of type
        ``dash-db``; other hit types are skipped.

        Raises:
            RuntimeError: If the search did not return a list (for example an
                error object such as ``{"message": "Unauthorized"}``).
        """
        import os

        os.makedirs(backup_dir, exist_ok=True)
        dashboards = self.search_dashboards()

        # An error response is a JSON object, not a list. Iterating it would
        # yield its keys and fail later with an unrelated TypeError.
        if not isinstance(dashboards, list):
            raise RuntimeError(f"Dashboard search failed: {dashboards!r}")

        for dashboard in dashboards:
            if dashboard.get('type') != 'dash-db':
                continue

            uid = dashboard['uid']
            # Both slash styles are path separators on some platform, so
            # neither may survive into the file name.
            title = (
                dashboard['title']
                .replace('/', '_')
                .replace('\\', '_')
                .replace(' ', '_')
            )
            file_path = os.path.join(backup_dir, f"{title}_{uid}.json")

            if self.export_dashboard(uid, file_path):
                print(f"Exported: {title}")
            else:
                print(f"Failed to export: {title}")

# Usage example
if __name__ == "__main__":
    # Client for a local Grafana instance, authenticated with an API key.
    client = GrafanaAPI('http://localhost:3000', 'your_api_key_here')

    # Describe a Prometheus data source (access mode "proxy", flagged as the
    # default) and register it through the API.
    datasource_payload = dict(
        name="Prometheus-API",
        type="prometheus",
        url="http://prometheus:9090",
        access="proxy",
        basicAuth=False,
        isDefault=True,
    )
    creation_response = client.create_data_source(datasource_payload)
    print(f"Data source created: {creation_response}")

    # Write every dashboard to ./grafana_backup as JSON files.
    client.backup_all_dashboards('./grafana_backup')

Alerting Configuration

# provisioning/alerting/alerts.yml
apiVersion: 1

# Each rule has two nodes: A runs the Prometheus query, and B is a
# server-side expression that reduces A to its last value and compares it with
# the threshold. `condition` must name B. Pointing it at the raw query A would
# apply no threshold at all, contradicting the "above 80% / 90%" descriptions.
groups:
  - name: infrastructure_alerts
    orgId: 1
    folder: alerts
    interval: 1m
    rules:
      - uid: cpu_high
        title: High CPU Usage
        condition: B
        data:
          - refId: A
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus_uid
            model:
              expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: classic_conditions
              refId: B
              conditions:
                - type: query
                  evaluator:
                    type: gt
                    params:
                      - 80
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    type: last
        noDataState: NoData
        execErrState: Alerting
        for: 5m
        annotations:
          description: "CPU usage is above 80% for more than 5 minutes"
          summary: "High CPU usage detected"
        labels:
          team: infrastructure
          severity: warning

      - uid: memory_high
        title: High Memory Usage
        condition: B
        data:
          - refId: A
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus_uid
            model:
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: classic_conditions
              refId: B
              conditions:
                - type: query
                  evaluator:
                    type: gt
                    params:
                      - 90
                  operator:
                    type: and
                  query:
                    params:
                      - A
                  reducer:
                    type: last
        noDataState: NoData
        execErrState: Alerting
        for: 5m
        annotations:
          description: "Memory usage is above 90%"
          summary: "High memory usage detected"
        labels:
          team: infrastructure
          severity: critical

# Contact points: where notifications are delivered.
contactPoints:
  - name: slack_alerts
    receivers:
      - uid: slack_uid
        type: slack
        settings:
          # Placeholder webhook URL — replace with the real incoming-webhook address.
          url: https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
          # The Slack integration names its target setting `recipient`;
          # `channel` is not a recognised key and would be ignored.
          recipient: "#alerts"
          username: Grafana
          title: "{{ .GroupLabels.alertname }}"
          text: "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}"

# Notification policy tree. The provisioning key is `policies`, and the
# top-level entry is the root policy for the organisation: it names the
# fallback receiver and must not carry matchers of its own. Label matching
# belongs on nested routes, written as Prometheus-style matcher strings.
policies:
  - orgId: 1
    receiver: slack_alerts
    group_by: ['alertname', 'cluster', 'service']
    group_wait: 10s
    group_interval: 10s
    repeat_interval: 1h
    routes:
      - receiver: slack_alerts
        matchers:
          - severity = warning

Troubleshooting

# Check Grafana status
sudo systemctl status grafana-server

# View logs
sudo journalctl -u grafana-server -f
tail -f /var/log/grafana/grafana.log

# Reset the admin password
# WARNING: this rewrites the stored admin credential. It is not a read-only
# database connection test.
grafana-cli admin reset-admin-password newpassword

# Plugin management
grafana-cli plugins list-remote
grafana-cli plugins install grafana-clock-panel
grafana-cli plugins update-all

# Show the installed version
# The -v flag prints version information and exits; it does not validate the
# configuration file.
grafana-server -config /etc/grafana/grafana.ini -v

# API health check
curl -H "Authorization: Bearer your_api_key" http://localhost:3000/api/health

# Database migration
# NOTE(review): `migrate` is not among the documented `grafana-cli admin`
# subcommands, and schema migrations normally run when the server starts.
# Confirm this command exists in your version before relying on it.
grafana-cli admin migrate

# Clear cache
# NOTE(review): this endpoint is not listed in the Admin HTTP API reference;
# confirm it exists in your version.
curl -X POST -H "Authorization: Bearer your_api_key" http://localhost:3000/api/admin/clear-cache

# Memory usage analysis
ps aux | grep grafana
# -o picks the single oldest matching process, so the path stays valid even
# when several processes match "grafana".
cat "/proc/$(pgrep -o grafana)/status"

# Check data source connectivity
# The URL is quoted so the shell does not treat `?` as a glob character.
curl -H "Authorization: Bearer your_api_key" "http://localhost:3000/api/datasources/proxy/1/api/v1/query?query=up"

# Performance profiling
# pprof is served only when profiling is enabled (for example
# GF_DIAGNOSTICS_PROFILING_ENABLED=true), and on its own port (6060 by
# default), not on the main HTTP port.
curl "http://localhost:6060/debug/pprof/goroutine?debug=1"