[collectd] rrd_queue_thread issue

Amit Gupta amit.gupta221 at gmail.com
Thu May 14 13:19:13 CEST 2009


Hi,

I am seeing a very strange issue with collectd. I have started
collectd using collectdmon, and collectd is configured to monitor
only CPU and memory. ReadThreads is set to 5. Initially, when I run
pstack on the collectd process, I see 7 threads, which is expected (5
read threads, 1 main thread, and 1 rrd_queue_thread):

# pstack 26921
26921:  /opt/webstack/admin/sbin/collectd -C /etc/opt/webstack/admin/collectd/
-----------------  lwp# 1 / thread# 1  --------------------
 fee44077 nanosleep (8045c68, 8045c68)
 fef92b67 nanosleep (8045c68, 8045c68, 8046888, 8054981) + 1b
 080549f7 do_loop  (0, 0, 0, 0, 0, 1) + d3
 08054ee0 main     (4, 8047d90, 8047da4, feffb818) + 3dc
 0805448d _start   (4, 8047e34, 8047e56, 8047e59, 8047e88, 0) + 7d
-----------------  lwp# 2 / thread# 2  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 80cad68) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe260200) + 4e
 fee43d70 _lwp_start (fe260200, 0, 0, fe36eff8, fee43d70, fe260200)
-----------------  lwp# 3 / thread# 3  --------------------
 fee43dcb lwp_park (0, 0, 4)
 fee3e60e cond_wait_queue (807e9d0, 807e9e0, 0, 0) + 3b
 fee3eb07 _cond_wait (807e9d0, 807e9e0) + 66
 fee3eb49 cond_wait (807e9d0, 807e9e0) + 21
 fee3eb82 pthread_cond_wait (807e9d0, 807e9e0, fe25efd8, 8059268) + 1b
 080592fb plugin_read_thread (0) + 10b
 fee43a81 _thr_setup (fe260a00) + 4e
 fee43d70 _lwp_start (fe260a00, 0, 0, fe25eff8, fee43d70, fe260a00)
-----------------  lwp# 4 / thread# 4  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (807e9d0, 807e9e0, 0, 0) + 3b
 fee3eb07 _cond_wait (807e9d0, 807e9e0) + 66
 fee3eb49 cond_wait (807e9d0, 807e9e0) + 21
 fee3eb82 pthread_cond_wait (807e9d0, 807e9e0, fe15ffd8, 8059268) + 1b
 080592fb plugin_read_thread (0) + 10b
 fee43a81 _thr_setup (fe261200) + 4e
 fee43d70 _lwp_start (fe261200, 0, 0, fe15fff8, fee43d70, fe261200)
-----------------  lwp# 5 / thread# 5  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (807e9d0, 807e9e0, 0, 0) + 3b
 fee3eb07 _cond_wait (807e9d0, 807e9e0) + 66
 fee3eb49 cond_wait (807e9d0, 807e9e0) + 21
 fee3eb82 pthread_cond_wait (807e9d0, 807e9e0, fe060fd8, 8059268) + 1b
 080592fb plugin_read_thread (0) + 10b
 fee43a81 _thr_setup (fe261a00) + 4e
 fee43d70 _lwp_start (fe261a00, 0, 0, fe060ff8, fee43d70, fe261a00)
-----------------  lwp# 6 / thread# 6  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (807e9d0, 807e9e0, 0, 0) + 3b
 fee3eb07 _cond_wait (807e9d0, 807e9e0) + 66
 fee3eb49 cond_wait (807e9d0, 807e9e0) + 21
 fee3eb82 pthread_cond_wait (807e9d0, 807e9e0, fdf61fd8, 8059268) + 1b
 080592fb plugin_read_thread (0) + 10b
 fee43a81 _thr_setup (fe262200) + 4e
 fee43d70 _lwp_start (fe262200, 0, 0, fdf61ff8, fee43d70, fe262200)
-----------------  lwp# 7 / thread# 7  --------------------
 fee43dcb lwp_park (0, 0, 6)
 fee3e60e cond_wait_queue (807e9d0, 807e9e0, 0, 0) + 3b
 fee3eb07 _cond_wait (807e9d0, 807e9e0) + 66
 fee3eb49 cond_wait (807e9d0, 807e9e0) + 21
 fee3eb82 pthread_cond_wait (807e9d0, 807e9e0, fde62fd8, 8059268) + 1b
 080592fb plugin_read_thread (0) + 10b
 fee43a81 _thr_setup (fe262a00) + 4e
 fee43d70 _lwp_start (fe262a00, 0, 0, fde62ff8, fee43d70, fe262a00)

However, after some time the number of rrd_queue_thread threads keeps increasing:
# pstack 26921
26921:  /opt/webstack/admin/sbin/collectd -C /etc/opt/webstack/admin/collectd/
.....
.....
....
-----------------  lwp# 8 / thread# 8  --------------------
 fee43dcb lwp_park (0, 0, c)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 80cae68) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe263200) + 4e
 fee43d70 _lwp_start (fe263200, 0, 0, fdd63ff8, fee43d70, fe263200)
-----------------  lwp# 9 / thread# 9  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 80cad68) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe263a00) + 4e
 fee43d70 _lwp_start (fe263a00, 0, 0, fdc64ff8, fee43d70, fe263a00)
-----------------  lwp# 10 / thread# 10  --------------------
 fee43dcb lwp_park (0, 0, b)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 8098e30) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe264200) + 4e
 fee43d70 _lwp_start (fe264200, 0, 0, fdb2eff8, fee43d70, fe264200)
-----------------  lwp# 11 / thread# 11  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 80cad68) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe264a00) + 4e
 fee43d70 _lwp_start (fe264a00, 0, 0, fda2fff8, fee43d70, fe264a00)
-----------------  lwp# 12 / thread# 12  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 80cad68) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe265200) + 4e
 fee43d70 _lwp_start (fe265200, 0, 0, fd930ff8, fee43d70, fe265200)


After looking at the code, I figured out that this happens because
the update_kstat function calls plugin_init_all, which ends up
creating a new rrd_queue_thread. Is this intentional?

Because of this behaviour, if I send a HUP signal to collectdmon to
restart collectd, collectd doesn't get killed. Here is the pstack
output of the collectd process after sending a HUP signal to collectdmon.

-----------------  lwp# 1 / thread# 1  --------------------
 fee44ad7 lwp_wait (c, 8046838)
 fee40ce7 _thrp_join (c, 0, 0, 1) + 5a
 fee40e66 pthread_join (c, 0, 1, feb82a4c) + 2b
 feb82ae4 rrd_shutdown (8046cc0, 0, 8047d58, 8054f08, 0, 0) + a4
 08059f86 plugin_shutdown_all (0, 0, 0, 0, 0, 1) + da
 08054f08 main     (4, 8047d90, 8047da4, feffb818) + 404
 0805448d _start   (4, 8047e34, 8047e56, 8047e59, 8047e88, 0) + 7d
-----------------  lwp# 2 / thread# 2  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 8098dd0) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe260200) + 4e
 fee43d70 _lwp_start (fe260200, 0, 0, fe36eff8, fee43d70, fe260200)
-----------------  lwp# 8 / thread# 8  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 80cae68) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe263200) + 4e
 fee43d70 _lwp_start (fe263200, 0, 0, fdd63ff8, fee43d70, fe263200)
-----------------  lwp# 9 / thread# 9  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 80cad68) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe263a00) + 4e
 fee43d70 _lwp_start (fe263a00, 0, 0, fdc64ff8, fee43d70, fe263a00)
-----------------  lwp# 10 / thread# 10  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 80cad68) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe264200) + 4e
 fee43d70 _lwp_start (fe264200, 0, 0, fdb2eff8, fee43d70, fe264200)
-----------------  lwp# 11 / thread# 11  --------------------
 feb81114 rrd_queue_thread(), exit value = 0x00000000
        ** zombie (exited, not detached, not yet joined) **
-----------------  lwp# 12 / thread# 12  --------------------
 fee43dcb lwp_park (0, 0, 0)
 fee3e60e cond_wait_queue (feb94528, feb94538, 0, 0) + 3b
 fee3eb07 _cond_wait (feb94528, feb94538) + 66
 fee3eb49 cond_wait (feb94528, feb94538) + 21
 fee3eb82 pthread_cond_wait (feb94528, feb94538, 0, 80cad68) + 1b
 feb811c4 rrd_queue_thread (0) + b0
 fee43a81 _thr_setup (fe265200) + 4e
 fee43d70 _lwp_start (fe265200, 0, 0, fd930ff8, fee43d70, fe265200)

As you can see, one of the rrd_queue_thread threads has died (zombied),
and thus collectdmon doesn't restart collectd. Is this a known issue?

Regards
Amit



More information about the collectd mailing list