[collectd] Solaris 10 core dump

collectd at faxm0dem.org collectd at faxm0dem.org
Thu Sep 22 10:28:31 CEST 2011


Hi,

I'm getting core dumps on Solaris 10 x86.
The program (collectd-5.0.0) runs fine for a few minutes (the exact time
varies), then dies miserably. This happens on 55 out of 191 servers; I'm not
sure why it's failing only on some. Not all servers have the same kernel
patch set. Here's the table relating failures to kernel levels:

#kernel           failure?  servers
#
Generic_141445-09 OK         5
#
Generic_144489-12 OK        15
Generic_144489-12 NOK       31
#
Generic_144489-17 OK         1
Generic_144489-17 NOK       20
#
Generic_142910-17 OK       115
Generic_142910-17 NOK        4

All 55 seem to die upon "_lwp_start" according to the stack trace.

Source has been compiled on a "Generic_142910-17" kernel using Sun Studio 12.
Please find attached the stack trace.
The debug level logfile of one failing server is available here:
http://paste.scsys.co.uk/144300

Thanks in advance for any help on this.

-------------- next part --------------
% pstack /opt/collectd/var/lib/collectd/core
core '/opt/collectd/var/lib/collectd/core' of 29040:    /opt/collectd/sbin/collectd
-----------------  lwp# 1 / thread# 1  --------------------
 fee0a877 ___nanosleep (80468f8, 80468f8) + 7
 fef92b67 nanosleep (80468f8, 80468f8, 1f7f137f, 3ad59af9) + 1b
 0805683b do_loop  (0, 0, 0, 0, 0, 1) + 217
 080571c3 main     (1, 8047e7c, 8047e84, fee83f00) + 71b
 08055c1d _start   (1, 8047f04, 0, 8047f20, 8047f38, 8047f63) + 7d
-----------------  lwp# 2 / thread# 2  --------------------
 fee0857b __lwp_park (feba0200, 0, 8098220, 0) + b
 fee01d47 mutex_lock_impl (8098220, 0) + 102
 fee01e20 mutex_lock (8098220, fe9eee74, fe9eeeb4) + 1a
 0806b440 uc_get_meta (fe9eed9c, fee7e000, fe9ee918, fea92014) + ac
 0806b778 uc_meta_data_add_unsigned_int (fe9eed9c) + 14
 fea9746d network_write (80a28a0, fe9eed9c, 80a41bc, feb711b6) + 13d
 0806461b plugin_write (0, 80a28a0, fe9eed9c, 8065fb8) + 14f
 0805f36b fc_bit_write_invoke (80a28a0, fe9eed9c, 0, 0) + 63
 08060039 fc_default_action (80a28a0, fe9eed9c, fe9eeeb4, 8064d2a) + 21
 080651c4 plugin_dispatch_values (fe9eed9c, feb50e14, 40, 2) + 68c
 feb50b79 submit   (2, feb50e14, 337d85, 0) + 47d
 feb50ced cpu_read (0, 247b0e59, 247b0e59, 247b0e59, 1000000, 0) + 165
 08062418 plugin_read_thread (0) + 44c
 fee08236 _thr_setup (feba0200) + 4e
 fee08520 _lwp_start (feba0200, 0, 0, fe9eeff8, fee08520, feba0200)
-----------------  lwp# 3 / thread# 3  --------------------
 fee0b297 _lwp_kill (3, 6) + 7
 fedb2d7b raise    (6) + 1f
 fed92171 abort    (feba0a00, fea613a8, 65737341, 6f697472, 6166206e, 64656c69) + cd
 fed9238b _assert  (8082f5c, 8082f50, 27f, fea50a0c) + 6b
 080589f6 get_kstat_value (0, fea61644, 20ae4f, 0) + 3e
 fea50efd za_read  (0, 247b1114, 247b1114, 247b1114, 1000000, 0) + 3b5
 08062418 plugin_read_thread (0) + 44c
 fee08236 _thr_setup (feba0a00) + 4e
 fee08520 _lwp_start (feba0a00, 0, 0, fe8efff8, fee08520, feba0a00)
-----------------  lwp# 4 / thread# 4  --------------------
 fee0857b __lwp_park (feba1200, 0, 8098220, 0) + b
 fee01d47 mutex_lock_impl (8098220, 0) + 102
 fee01e20 mutex_lock (8098220, fe7f0814, fe7f0854) + 1a
 0806b440 uc_get_meta (fe7f073c, fee7e000, fe7f02b8, fea92014) + ac
 0806b778 uc_meta_data_add_unsigned_int (fe7f073c) + 14
 fea9746d network_write (80a2cc0, fe7f073c, 80a41bc, 18) + 13d
 0806461b plugin_write (0, 80a2cc0, fe7f073c, 8065fb8) + 14f
 0805f36b fc_bit_write_invoke (80a2cc0, fe7f073c, 0, 0) + 63
 08060039 fc_default_action (80a2cc0, fe7f073c, fe7f0854, 8064d2a) + 21
 080651c4 plugin_dispatch_values (fe7f073c, feb33a7c, 40, feb313ec) + 68c
 feb31877 df_submit_one (fe7f0dbc, feb33a84, feb33a7c, 0, 41df8c32) + 49b
 feb31e1d df_read  (0, 247db999, 247db999, 247db999, 1000000, 0) + 595
 08062418 plugin_read_thread (0) + 44c
 fee08236 _thr_setup (feba1200) + 4e
 fee08520 _lwp_start (feba1200, 0, 0, fe7f0ff8, fee08520, feba1200)
-----------------  lwp# 5 / thread# 5  --------------------
 fee0857b __lwp_park (80981f0, 8098200, fe6f1f00) + b
 fee02da6 cond_wait_queue (80981f0, 8098200, fe6f1f00) + 5e
 fee03123 cond_wait_common (80981f0, 8098200, fe6f1f00) + 1db
 fee03355 _cond_timedwait (80981f0, 8098200, 80a1798) + 51
 fee033c0 cond_timedwait (80981f0, 8098200, 80a1798) + 24
 fee033fc pthread_cond_timedwait (80981f0, 8098200, 80a1798, 806272a) + 1e
 080622e8 plugin_read_thread (0) + 31c
 fee08236 _thr_setup (feba1a00) + 4e
 fee08520 _lwp_start (feba1a00, 0, 0, fe6f1ff8, fee08520, feba1a00)
-----------------  lwp# 6 / thread# 6  --------------------
 fee085ab __lwp_unpark (8098220, 80c30b4, fe5f2b98, 806a138) + b
 0806a17d uc_update (80a3180, fe5f2d90, fe5f2ea8, 8064d2a) + a55
 08065147 plugin_dispatch_values (fe5f2d90, feb1138c, 40, feb10b14) + 60f
 feb10f9d disk_submit (80fd69c, feb1138c, 2c403de9, 4d, e4ac1800, f) + 499
 feb110db disk_read (0, 24958599, 24958599, 24958599, 1000000, 0) + 12f
 08062418 plugin_read_thread (0) + 44c
 fee08236 _thr_setup (feba2200) + 4e
 fee08520 _lwp_start (feba2200, 0, 0, fe5f2ff8, fee08520, feba2200)


More information about the collectd mailing list