пʼятницю, 11 липня 2014 р.

Scripting Grafana dashboards


One day I came across Grafana and found it very attractive.
Its great usability and responsive design quickly made it my favorite.
The templated dashboards feature is very handy.
But the real power comes with scripted dashboards: it is possible to make async calls to an external controller, get some JSON data, and construct the dashboard object dynamically, based on the data received.

There was no good example or how-to, though, so I had some work to do to understand how I could use it.


  • I have hundreds of servers, and I want to view a pre-defined dashboard for any server instance. Server configurations are different, each server type runs different processes, has different number of disks and network interfaces.
  • I don't want to select server name from long drop-down filter list,
  • I don't want to create a separate dashboard template for every server type,
  • I do not want to edit template each time I change server configuration (add more volumes, for example).
  • I'd like to define filters dynamically


Define some helper functions first.
find_filter_values() and expand_filter_values() are used to query external services for data. Here I query Graphite for the available metrics, but it could just as easily be a call to Zookeeper/Sensu/RabbitMQ/etc.


// names the script can reach in this scope
var window, document, ARGS, $, jQuery, moment, kbn;

// fallbacks applied when the matching URL argument is not supplied
var arg_env = 'prod',     // environment, first part of the metric prefix
    arg_i = 'default',    // instance name (or pattern) to display
    arg_span = 4,         // span assigned to every panel
    arg_from = '2h';      // dashboard time range: "now-" + arg_from

  // return dashboard filter_list
  // optionally include 'All'
  // Build one dashboard filter object (an entry of the filter list).
  //   name     - filter name stored in the returned object
  //   query    - query handed to find_filter_values() to discover the options
  //   show_all - optional, defaults to true; when enabled an 'All' option
  //              covering every discovered value is put first
  // Returns the filter object. 'current' is the first option, which is
  // undefined when nothing was found and show_all is disabled.
  function get_filter_object(name,query,show_all){
    show_all = (typeof show_all === "undefined") ? true : show_all;
    var values = find_filter_values(query);
    // map() visits real array elements only; for..in would also pick up
    // enumerable properties that a library added to Array.prototype
    var opts = values.map(function(value){
      return {"text": value, "value": value};
    });
    // loose comparison kept on purpose so callers passing 1 still enable 'All'
    if (show_all == true) {
      // '{a,b,c}' is a brace list matching every value at once
      opts.unshift({"text":"All", "value": '{' + values.join() + '}'});
    }
    return {
      type: "filter",
      name: name,
      query: query,
      options: opts,
      current: opts[0],
      includeAll: show_all
    };
  };

  // execute graphite-api /metrics/find query
  // return array of metric last names ( func('test.cpu-*') returns ['cpu-0','cpu-1',..] )
  // Execute a graphite-api /metrics/find query with a synchronous request
  // to '/_graphite' on the current host.
  //   query - Graphite metric pattern, e.g. 'test.cpu-*'
  // Returns an array with the 'text' of every entry in the JSON response
  // ( func('test.cpu-*') returns ['cpu-0','cpu-1',..] ).
  // Throws if the response body is not valid JSON.
  function find_filter_values(query){
    // encode the pattern so characters like '&', '#' or '+' cannot break the URL
    var search_url = window.location.protocol + '//' + window.location.host +
                     '/_graphite/metrics/find/?query=' + encodeURIComponent(query);
    var req = new XMLHttpRequest();
    req.open('GET', search_url, false); // false: block until the response arrives
    req.send(null);
    var nodes = JSON.parse(req.responseText);
    var res = [];
    if (nodes === null || typeof nodes !== 'object') {
      return res;
    }
    // Object.keys() works for both an array and a keyed object response
    // and never walks inherited properties
    Object.keys(nodes).forEach(function(key){
      var node = nodes[key];
      // skip null / primitive entries instead of throwing on them
      if (node !== null && typeof node === 'object' &&
          Object.prototype.hasOwnProperty.call(node, 'text')) {
        res.push(node['text']);
      }
    });
    return res;
  };

  // execute graphite-api /metrics/expand query
  // return array of metric full names (func('*.cpu-*') returns ['test.cpu-0','test.cpu-1',..] )
  // Execute a graphite-api /metrics/expand query with a synchronous request
  // to '/_graphite' on the current host.
  //   query - Graphite metric pattern, e.g. '*.cpu-*'
  // Returns the 'results' member of the JSON response
  // ( func('*.cpu-*') returns ['test.cpu-0','test.cpu-1',..] ),
  // or an empty array when the response has no such member.
  // Throws if the response body is not valid JSON.
  function expand_filter_values(query){
    // encode the pattern so characters like '&', '#' or '+' cannot break the URL
    var search_url = window.location.protocol + '//' + window.location.host +
                     '/_graphite/metrics/expand/?query=' + encodeURIComponent(query);
    var req = new XMLHttpRequest();
    req.open('GET', search_url, false); // false: block until the response arrives
    req.send(null);
    var obj = JSON.parse(req.responseText);
    // a literal 'null' body parses fine but has no properties to inspect
    if (obj !== null && typeof obj === 'object' &&
        Object.prototype.hasOwnProperty.call(obj, 'results')) {
      return obj['results'];
    }
    return [];
  };

  // used to calculate aliasByNode index in panel template
  // Helper for the panel templates: the number of dot-separated pieces in
  // the prefix, minus two. Panels add a fixed offset to this value to get
  // their aliasByNode index.
  function len(prefix){
    var segments = prefix.split('.');
    var base_index = segments.length - 2;
    return base_index;
  };

Define panel templates (for cpu, memory, etc).
(Metric prefixes may vary, use helper len() to make aliasByNode() usage easier).


  // Panel template: stacked percentage graph of collectd CPU counters,
  // summed over all cpu-* nodes of the selected [[instance]].
  //   title  - panel title
  //   prefix - metric path in front of the instance name, ending with a dot
  // Returns the panel object; its span comes from arg_span.
  function panel_collectd_cpu(title,prefix){
    // [collectd counter suffix, alias shown in the legend]
    var series = [
      ['user',      'user'],
      ['system',    'system'],
      ['idle',      'idle'],
      ['wait',      'wait'],
      ['steal',     'steal'],
      ['nice',      'nice'],
      ['softirq',   'irq'],
      ['interrupt', 'intrpt']
    ];
    // every target has the same shape, only the counter and alias differ
    var targets = series.map(function(entry){
      return { "target": "alias(sumSeries(nonNegativeDerivative(" + prefix + "[[instance]].cpu-*.cpu-" + entry[0] + ",0)),'" + entry[1] + "')" };
    });
    return {
      title: title,
      type: 'graphite',
      span: arg_span,
      renderer: "flot",
      y_formats: ["none"],
      grid: {max: null, min: 0},
      lines: true,
      fill: 1,
      linewidth: 1,
      stack: true,
      legend: {show: true},
      percentage: true,
      nullPointMode: "null",
      tooltip: {
        value_type: "individual",
        query_as_alias: true
      },
      targets: targets,
      aliasColors: {
        "user": "#508642",
        "system": "#EAB839",
        "wait": "#890F02",
        "steal": "#E24D42",
        "idle": "#6ED0E0"
      }
    };
  };

  // Panel template: received / transmitted octets of one network interface
  // of the selected [[instance]]; the tx series is scaled by -1.
  //   title  - panel title, the interface name is appended to it
  //   prefix - metric path in front of the instance name, ending with a dot
  //   intrf  - optional interface node name, 'interface-eth0' when omitted
  // Returns the panel object; its span comes from arg_span.
  function panel_collectd_network_octets(title,prefix,intrf){
    if (typeof intrf === "undefined") {
      intrf = 'interface-eth0';
    }
    // node index passed to aliasByNode
    var alias_node = len(prefix) + 4;
    // part shared by both series, up to the rx/tx suffix
    var octets = "keepLastValue(" + prefix + "[[instance]]." + intrf + ".if_octets.";
    var rx_target = "aliasByNode(movingMedian(nonNegativeDerivative(" + octets + "rx,10),0),'5min')," + alias_node + ")";
    var tx_target = "aliasByNode(movingMedian(scale(nonNegativeDerivative(" + octets + "tx,10),0),-1),'5min')," + alias_node + ")";
    var panel = {
      title: title + ', ' + intrf,
      type: 'graphite',
      span: arg_span,
      y_formats: ["bytes"],
      grid: {max: null, min: null},
      lines: true,
      fill: 1,
      linewidth: 2,
      nullPointMode: "null",
      targets: [
        { "target": rx_target },
        { "target": tx_target }
      ]
    };
    return panel;
  };

Define rows (made of panels).
For network row, query external service to discover all available network interfaces and add necessary panels.

  // Row template: an expanded 250px row holding the CPU, memory and
  // load average panels, in that order.
  //   title  - row title
  //   prefix - metric path handed on to each panel template
  function row_cpu_memory(title,prefix){
    var row = {
      title: title,
      height: '250px',
      collapse: false,
      panels: []
    };
    row.panels.push(panel_collectd_cpu('CPU, %',prefix));
    row.panels.push(panel_collectd_memory('Memory',prefix));
    row.panels.push(panel_collectd_loadavg('Load avg, 10min',prefix));
    return row;
  };


  // Row template: a collapsed 250px row with an octets panel and a packets
  // panel for every network interface found for the instance.
  //   title  - row title
  //   prefix - metric path handed on to each panel template
  //   filter - metric path of the instance; '.interface-*' is appended to it
  //            to look up the interfaces via find_filter_values()
  function row_network(title,prefix,filter){
    var interfaces = find_filter_values(filter + '.interface-*');
    var panels_network = [];
    // indexed loop: for..in would also walk enumerable properties that a
    // library added to Array.prototype and create bogus panels for them
    for (var n = 0; n < interfaces.length; n++) {
      panels_network.push(
        panel_collectd_network_octets('network octets',prefix,interfaces[n]),
        panel_collectd_network_packets('network packets',prefix,interfaces[n])
      );
    }
    return {
      title: title,
      height: '250px',
      collapse: true,
      panels: panels_network
    };
  };

Finally, a callback function to build whole dashboard from the pieces above.
Query external service(s) to check which optional rows to append to the dashboard. Build one filter object dynamically and add it to the dashboard. Note that I pass 'false' in 'get_filter_object("instance",arg_filter,false)'. If I don't, the filter's 'Include All' option gets enabled and selected automatically.


return function(callback) {

  // Setup some variables
  var dashboard, timspan;
  
  if(!_.isUndefined(ARGS.span)) {
    arg_span = ARGS.span;
  }

  if(!_.isUndefined(ARGS.env)) {
    arg_env = ARGS.env;
  }

  if(!_.isUndefined(ARGS.i)) {
    arg_i = ARGS.i;
  }

  if(!_.isUndefined(ARGS.from)) {
    arg_from = ARGS.from;
  }

 
  /* prefix - depends on actual Graphite tree. 
       In my case it depends on environment which can be passed as argument too.
      < env >.collectd.hosts.< hostname >.< metric >
      < env >.statsd.hosts.< hostname >.< metric >
      < env >.jmxtrans.hosts.< hostname >.< metric >
      < env >.putin.hujlo.< hostname >.la-la-la.< metric >
  */

  var prefix = arg_env + '.collectd.hosts.';
   
  var arg_filter = prefix + arg_i;

  // set filter
  var dashboard_filter = {
    time: {
      from: "now-" + arg_from,
      to: "now"
    },
    list: [
      get_filter_object("instance",arg_filter,false)
      ]
  };

  // define pulldowns
  pulldowns = [
    {
      type: "filtering",
      collapse: false,
      notice: false,
      enable: true
    },
    {
      type: "annotations",
      enable: false
    }
  ];

  // Intialize a skeleton with nothing but a rows array and service object

  dashboard = {
    rows : [],
    services : {}
  };
  dashboard.title = arg_i + ' (' + arg_env + ')';
  dashboard.editable = true;
  dashboard.pulldowns = pulldowns;
  dashboard.services.filter = dashboard_filter;


  // custom dashboard rows (appended to the default dashboard rows)

  var optional_rows = [];


  // If there are any JMX metrics available for this host - append them to dashboard

  var proc_metrics = expand_filter_values(arg_env + '.jmxtrans.hosts.' + arg_i + '.proc.*.jmx');

  for (var m in proc_metrics) {

    proc_name = proc_metrics[m].split('.')[5];

    var jmx_prefix = arg_env + '.jmxtrans.hosts.[[instance]].proc.' + proc_name + '.jmx.';

    optional_rows.push(
                row_delimiter(proc_name),
                row_jvm('JVM Heap',jmx_prefix)
              );

    var jetty_metric = find_filter_values(arg_env + '.jmxtrans.hosts.' + arg_i + '.proc.' + proc_name + '.jmx.jetty_threads');

    if (jetty_metric.length > 0) {
      optional_rows.push(row_jetty('Jetty',jmx_prefix));
    };
  
  };

  $.ajax({
    method: 'GET',
    url: '/'
  })
  .done(function(result) {

    // construct dashboard rows
    
    dashboard.rows.push(
                      row_cpu_memory('cpu, memory',prefix),
                      row_swap('swap',prefix),
                      row_network('network',prefix,arg_filter),
                      row_disk_space('disk space',prefix,arg_filter),
                      row_disk_usage('disk ops',prefix,arg_filter)
                    );

    // custom rows
    for (var i in optional_rows){
      dashboard.rows.push(optional_rows[i]);
    };

    // when dashboard is composed call the callback
    // function and pass the dashboard
    callback(dashboard);
  });
}


Full code Gist is available.
-

Host: i-abcd1234.

It has a number of network interfaces and storage volumes attached.
At the moment, it is running two Java applications: Collector and Analyzer.

Collectd metrics:
  'stage.collectd.hosts.i-abcd1234.cpu-*'

Jmxtrans metrics for process 'Collector' (backend process):
  'stage.jmxtrans.hosts.i-abcd1234.proc.collector.jmx.memory.*'
  'stage.jmxtrans.hosts.i-abcd1234.proc.collector.jmx.memorypool.*'
  'stage.jmxtrans.hosts.i-abcd1234.proc.collector.jmx.gc.*'

, and for 'Analyzer' (jetty web app):

  'stage.jmxtrans.hosts.i-abcd1234.proc.analyzer.jmx.{memorypool,gc}.*'
  'stage.jmxtrans.hosts.i-abcd1234.proc.analyzer.jmx.jetty_threads.*'



Open dashboard by specifying instance name as URL-argument :
 http://grafana/#/dashboard/script/instance.js?env=stage&i=i-abcd1234




I can see optional panels (Collector and Analyzer) added, so I immediately know what processes this instance is running. (Here I use Graphite as my 'database of truth' to get idea about running processes and metrics available).



I can expand the Network row to view the network interfaces (if there were 4 interfaces, it would show them all, without any action on my side):



, the same for disk space/ops :


, and here is JVM statistics for Collector process:
 


It is possible to pass regexp instead of argument (and get more than one item in filter dropdown list).
This:
http://grafana/#/dashboard/script/instance.js?env=stage&i=www-*
, or even this:
http://grafana/#/dashboard/script/instance.js?env=stage&i={i-abcd1234,i-efgh5678}
 nice, isn't it ?


If I need wider graphs I can change default panel width by passing additional argument &span= : http://grafana/#/dashboard/script/instance.js?env=stage&i=i-abcd1234&span=6


In this example, I have lots of graphs on the dashboard, and it does not slow down dashboard rendering. Well, it does, but I usually collapse the less important rows to avoid unnecessary rendering. This way I can select 'Last 30 days' in Grafana's timepicker and not get angry waiting for 10 panels to finish rendering.


I don't like having to append the &env parameter for every non-default environment, so most probably the next improvement will be to add another JS function that determines the environment automatically (by querying my external 'database of truth').

Of course, this scripted dashboard can be refactored/optimized even more, but my goal here is to keep it easy to understand and easy to maintain/modify.


8 коментарів:

Alexandre DErumier сказав...

THANKS YOU FOR THIS SCRIPT !

I WAS LOOKING FOR THIS SINCE MONTHS !

Time to move from cacti :)

Анонім сказав...

Thanks so much for this script!

We use StoreRates and SeparateInstances in our collectd write_graphite config, so I needed to adjust for our needs.

Here's a gist of that (simplified, no jmxtrans) script if needed:

https://gist.github.com/mikepea/07b1cede92c119e4f297

Dieter_be сказав...

nice approach, but this way you always have to know what you're looking for and enter it in the URL, right?
would be nice if there was a way to get an interface to search instances, with autocomplete etc. since grafana already uses this kind of widgets for the dashboard search, maybe we could somehow reuse that widget :)

Anatoliy D. сказав...

Hi Dieter_be,
it is less about Grafana itself, it's more about how to make Grafana dashboards even more 'dynamic' and how to integrate it with your existing infrastructure.

Here is the use case:
we have a very dynamic environment, with new servers scaling up and down every minute. Typically, I don't even want to know about any single server existence until the real problem happens :)
In case of event, our monitoring system sends me a message with a description and a link to a specific server in trouble : http://grafana/#/dashboard/script/instance.js?i=i-abcd1234
I just click the link and I see all the data for this specific server data immediately, no search, no URL change. I can further post this link to Operations chat, and they will need just to click, simple, doing no search, no typing.

Dieter_be сказав...

Hey Anatoliy,
I understand.
But in my case I want to build similar dynamic dashboards and I don't have an app that points to prepopulated urls, because it's not an alerting use case. For example let's say I have a 100 different versions of the same website (for 100 different communities). And I want to build a dashboard that shows stats of a given community. At some point I'll say "I wonder how site 'foobar' is doing, let me open up its dashboard". It would be nice if we could reuse the grafana widgets so i could build a little search/filtering widget that i can populate with values (the site names), so i can easily search and narrow down to the one i'm looking for, and then load the correct dashboard. sure i could do this in an external app, but it's nicer to keep it in 1 app, especially since grafana already has the widgets in place. i'll request this as a feature for grafana, to make the search widget more reusable (or mabye it already is, haven't tried yet)

patricia сказав...

What a nice website. With good information and nice people.

Sid сказав...

Hello,

This has been helpful for us to understand the concept clearly. I need to understand how to access the script. I tried but it fails with 404 error. Could you please help out with this.I need to know how to access the scripts. What needs to pass in the url?

Thanks a lot in advance.

Libby сказав...

Hi,
links don't work anymore. Could you fix it?
Thank you