[collectd] Collectd and FreeBSD restart behavior + df and zfs file system.

Day, David dday at redcom.com
Wed Feb 3 21:26:14 CET 2016


Hi,

Some observations from the trenches:
Please file, post and/or suggest redirection.
Overall, I am pleased with the tool and its flexibility.

I am using collectd V5.5 under FreeBSD V10.2.

Whenever I run "sudo service collectd restart", I observe the following two behaviors.

1.) collectd core dumps.

2016-02-03T19:49:02.913918-05:00 alabama kern info kernel:pid 80434 (collectd), uid 0: exited on signal 11 (core dumped)   (Actually, this also happens if the service command is simply "stop".)

2.) As the restart occurs, just about every plugin I have configured triggers an rrdtool complaint in the syslog: "illegal attempt to update ....."
e.g.  

2016-02-03T19:48:02.694521-05:00 alabama daemon warning collectd[80434]: rrdtool plugin: rrd_update_r (/var/db/collectd/rrd/localhost/memory/memory-active.rrd) failed: /var/db/collectd/rrd/localhost/memory/memory-active.rrd: illegal attempt to update using time 1454546882 when last update time is 1454546882 (minimum one second step)
2016-02-03T19:48:02.695166-05:00 alabama daemon warning collectd[80434]: rrdtool plugin: rrd_update_r (/var/db/collectd/rrd/localhost/memory/memory-cache.rrd) failed: /var/db/collectd/rrd/localhost/memory/memory-cache.rrd: illegal attempt to update using time 1454546882 when last update time is 1454546882 (minimum one second step)
2016-02-03T19:48:12.671130-05:00 alabama daemon warning collectd[80434]: rrdtool plugin: rrd_update_r (/var/db/collectd/rrd/localhost/load/load.rrd) failed: /var/db/collectd/rrd/localhost/load/load.rrd: illegal attempt to update using time 1454546882 when last update time is 1454546882 (minimum one second step)

My work-around for this is to avoid the service collectd restart option. 
Something like the following avoids the second observation. The sleep step is critical.

service collectd stop; sleep 2; service collectd start

I suspect this may actually be more of a matter for the FreeBSD mailing lists?

As a separate application matter, I found that the df plugin did a poor job of dealing with a ZFS filesystem when calculating used capacity and reporting it to the threshold plugin.  I ended up writing an exec extension that uses zpool to emulate what I was hoping df would generate for me.




Best Regards


Dave Day

 
For completeness, here is my collectd.conf file.

cat /usr/local/etc/collectd.conf
#
# Config file for collectd(1).
# Please read collectd.conf(5) for a list of options.
# http://collectd.org/

##############################################################################
# Global                                                                     #
#----------------------------------------------------------------------------#
# Global settings for the daemon.                                            #

Hostname    "localhost"
FQDNLookup   true
BaseDir     "/var/db/collectd"
PIDFile     "/var/run/collectd.pid"
TypesDB     "/usr/local/share/collectd/types.db"

#----------------------------------------------------------------------------#
# When enabled, plugins are loaded automatically with the default options    #
# when an appropriate <Plugin ...> block is encountered.                     #
# Disabled by default.                                                       #
#----------------------------------------------------------------------------#
#AutoLoadPlugin false

#----------------------------------------------------------------------------#
# Interval at which to query values. This may be overwritten on a per-plugin #
# base by using the 'Interval' option of the LoadPlugin block:               #
#   <LoadPlugin foo>                                                         #
#       Interval 60                                                          #
#   </LoadPlugin>                                                            #
#----------------------------------------------------------------------------#
#Interval     10

#Timeout      2
#ReadThreads  5
#WriteThreads 5

# Limit the size of the write queue. Default is no limit. Setting up a limit is
# recommended for servers handling a high volume of traffic.
#WriteQueueLimitHigh 1000000
#WriteQueueLimitLow   800000

# config file assumes collectd V5.5
# A value is identified by a unique name, which we usually call the "identifier". It consists of five parts, two of which are optional:
#   host "/" plugin ["-" plugin instance] "/" type ["-" type instance]
#  [/var/db/collectd/rrd/]localhost/cpu-0/cpu-idle

##############################################################################
# Logging                                                                    #
#----------------------------------------------------------------------------#
# Plugins which provide logging functions should be loaded first, so log     #
# messages generated when loading or configuring other plugins can be        #
# accessed.                                                                  #
##############################################################################

LoadPlugin syslog
<plugin syslog>
  LogLevel warning
  NotifyLevel "OKAY"
</plugin>

LoadPlugin cpu
LoadPlugin unixsock

<Plugin cpu>
  ReportByState true  # separate database for time of  system,idle,user,nice,interrupt
  ReportByCpu false   # aggregate across cores.
  ValuesPercentage true
</Plugin>

LoadPlugin exec
<Plugin exec>
        Exec "www:www" "/usr/local/sbin/collectd_zpool"
        Exec "www:www" "/usr/local/sbin/collectd_coretemp"
        Exec "www:www" "/usr/local/sbin/collectd_smart.sh"
        Exec "www:www" "/usr/local/sbin/collectd_superio.sh"
</Plugin>


LoadPlugin interface
LoadPlugin load
LoadPlugin memory
LoadPlugin match_regex

LoadPlugin postgresql
<plugin postgresql>
 <Query active_calls>
  Statement "SELECT COALESCE((SELECT COUNT(*) FROM log.pep_connection),0) as active_calls;"
  <Result>
     Type gauge
     InstancePrefix "active_calls"
     ValuesFrom active_calls
  </Result>
 </Query>


 <Database ace_db>
   Host "localhost"
   Port "5432"
   User "collectd"
   Query active_calls
 </Database>
</plugin>

LoadPlugin processes

LoadPlugin rrdtool

<plugin rrdtool>
  DataDir "/var/db/collectd/rrd"
</plugin>
LoadPlugin swap
LoadPlugin threshold
LoadPlugin tail


# Count the messages per facility at warning level or above.
<plugin "tail">
 <File "/var/log/messages">
  Instance "messages"
  <Match>
# localhost/tail-messages/counter-ace
   Regex "local1.(err|warn|alert|crit)"
   DSType "CounterInc"
   Type "counter"
   Instance "ace"
  </Match>
  <Match>
   Regex "local0.(err|warn|alert|crit)"
   ExcludeRegex "smdr:"
   DSType "CounterInc"
   Type "counter"
   Instance "postgres"
  </Match>
  <Match>
   Regex "local4.(err|warn|alert|crit)"
   DSType "CounterInc"
   Type "counter"
   Instance "mec"
  </Match>
  <Match>
   Regex "local5.(err|warn|alert|crit)"
   DSType "CounterInc"
   Type "counter"
   Instance "web"
  </Match>
  <Match>
   Regex "(local6|local7).(err|warn|alert|crit)"
   DSType "CounterInc"
   Type "counter"
   Instance "apache"
  </Match>
   <Match>
   Regex "^.*$"
   ExcludeRegex " local[0-7] "
   DSType "CounterInc"
   Type "counter"
   Instance "os"
  </Match>
 </File>
</plugin>

<Plugin unixsock>
   SocketFile "/var/run/collectd-unixsock"
   SocketPerms "0660"
   DeleteSocket false
</Plugin>


#https://collectd.org/documentation/manpages/collectd-threshold.5.shtml
<Plugin "threshold">

   <plugin "cpu">
       <type "percent">
         Instance "idle"
         WarningMin 50
         FailureMin 5
         Hits 3
       </type>
   </plugin>

   <Plugin "memory">
     <Type "memory">
         Instance "free"
         WarningMin 100000000
     </Type>
   </Plugin>

   <Plugin "swap">
     <Type "swap">
         Instance "free"
         WarningMin 100000000
     </Type>
   </Plugin>

   <plugin "load">
    <type "load">
     DataSource "midterm"
     FailureMax 4
     Hits 3
     Hysteresis 3
    </type>
   </plugin>

   <Plugin "tail">
    Instance "messages"
    <type "counter">
     Instance "os"
     WarningMax 5
    </type>
    <type "counter">
     Instance "ace"
     WarningMax 5
    </type>
    <type "counter">
     Instance "apache"
     WarningMax 5
    </type>
    <type "counter">
     Instance "web"
     WarningMax 5
    </type>
    <type "counter">
     Instance "postgres"
     WarningMax 5
    </type>
    <type "counter">
     Instance "mec"
     WarningMax 5
    </type>
   </Plugin>

   <plugin "exec">
    Instance "coretemp"
    <type "temperature">
     Instance "avg"
     WarningMax 80
    </type>
   </plugin>

   <plugin "exec">
    Instance "zpool"
    <type "gauge">
     Instance "val"
     WarningMax 75
     FailureMax 90
    </type>
   </plugin>

   <plugin "exec">
    Instance "superio"
    <type "temperature">
     Instance "cpu"
     WarningMax 80
    </type>
    <type "temperature">
     Instance "system"
     WarningMax 80
    </type>
   </plugin>

 </Plugin>






More information about the collectd mailing list