[collectd] Solaris 10 core dump
collectd at faxm0dem.org
collectd at faxm0dem.org
Thu Sep 22 10:28:31 CEST 2011
Hi,
I'm getting core dumps on Solaris 10 x86.
The program (collectd-5.0.0) runs fine for a few minutes (the exact time
varies), then dies miserably. This happens on 55 out of 191 servers; I'm not
sure why it fails only on some. Not all servers have the same kernel patch
set. Here's the table relating failures to kernel levels:
#kernel failure? servers
#
Generic_141445-09 OK 5
#
Generic_144489-12 OK 15
Generic_144489-12 NOK 31
#
Generic_144489-17 OK 1
Generic_144489-17 NOK 20
#
Generic_142910-17 OK 115
Generic_142910-17 NOK 4
All 55 seem to die upon "_lwp_start" according to the stack trace.
The source was compiled on a "Generic_142910-17" kernel using Sun Studio 12.
Please find attached the stack trace.
The debug level logfile of one failing server is available here:
http://paste.scsys.co.uk/144300
Thanks in advance for any help on this.
-------------- next part --------------
% pstack /opt/collectd/var/lib/collectd/core
core '/opt/collectd/var/lib/collectd/core' of 29040: /opt/collectd/sbin/collectd
----------------- lwp# 1 / thread# 1 --------------------
fee0a877 ___nanosleep (80468f8, 80468f8) + 7
fef92b67 nanosleep (80468f8, 80468f8, 1f7f137f, 3ad59af9) + 1b
0805683b do_loop (0, 0, 0, 0, 0, 1) + 217
080571c3 main (1, 8047e7c, 8047e84, fee83f00) + 71b
08055c1d _start (1, 8047f04, 0, 8047f20, 8047f38, 8047f63) + 7d
----------------- lwp# 2 / thread# 2 --------------------
fee0857b __lwp_park (feba0200, 0, 8098220, 0) + b
fee01d47 mutex_lock_impl (8098220, 0) + 102
fee01e20 mutex_lock (8098220, fe9eee74, fe9eeeb4) + 1a
0806b440 uc_get_meta (fe9eed9c, fee7e000, fe9ee918, fea92014) + ac
0806b778 uc_meta_data_add_unsigned_int (fe9eed9c) + 14
fea9746d network_write (80a28a0, fe9eed9c, 80a41bc, feb711b6) + 13d
0806461b plugin_write (0, 80a28a0, fe9eed9c, 8065fb8) + 14f
0805f36b fc_bit_write_invoke (80a28a0, fe9eed9c, 0, 0) + 63
08060039 fc_default_action (80a28a0, fe9eed9c, fe9eeeb4, 8064d2a) + 21
080651c4 plugin_dispatch_values (fe9eed9c, feb50e14, 40, 2) + 68c
feb50b79 submit (2, feb50e14, 337d85, 0) + 47d
feb50ced cpu_read (0, 247b0e59, 247b0e59, 247b0e59, 1000000, 0) + 165
08062418 plugin_read_thread (0) + 44c
fee08236 _thr_setup (feba0200) + 4e
fee08520 _lwp_start (feba0200, 0, 0, fe9eeff8, fee08520, feba0200)
----------------- lwp# 3 / thread# 3 --------------------
fee0b297 _lwp_kill (3, 6) + 7
fedb2d7b raise (6) + 1f
fed92171 abort (feba0a00, fea613a8, 65737341, 6f697472, 6166206e, 64656c69) + cd
fed9238b _assert (8082f5c, 8082f50, 27f, fea50a0c) + 6b
080589f6 get_kstat_value (0, fea61644, 20ae4f, 0) + 3e
fea50efd za_read (0, 247b1114, 247b1114, 247b1114, 1000000, 0) + 3b5
08062418 plugin_read_thread (0) + 44c
fee08236 _thr_setup (feba0a00) + 4e
fee08520 _lwp_start (feba0a00, 0, 0, fe8efff8, fee08520, feba0a00)
----------------- lwp# 4 / thread# 4 --------------------
fee0857b __lwp_park (feba1200, 0, 8098220, 0) + b
fee01d47 mutex_lock_impl (8098220, 0) + 102
fee01e20 mutex_lock (8098220, fe7f0814, fe7f0854) + 1a
0806b440 uc_get_meta (fe7f073c, fee7e000, fe7f02b8, fea92014) + ac
0806b778 uc_meta_data_add_unsigned_int (fe7f073c) + 14
fea9746d network_write (80a2cc0, fe7f073c, 80a41bc, 18) + 13d
0806461b plugin_write (0, 80a2cc0, fe7f073c, 8065fb8) + 14f
0805f36b fc_bit_write_invoke (80a2cc0, fe7f073c, 0, 0) + 63
08060039 fc_default_action (80a2cc0, fe7f073c, fe7f0854, 8064d2a) + 21
080651c4 plugin_dispatch_values (fe7f073c, feb33a7c, 40, feb313ec) + 68c
feb31877 df_submit_one (fe7f0dbc, feb33a84, feb33a7c, 0, 41df8c32) + 49b
feb31e1d df_read (0, 247db999, 247db999, 247db999, 1000000, 0) + 595
08062418 plugin_read_thread (0) + 44c
fee08236 _thr_setup (feba1200) + 4e
fee08520 _lwp_start (feba1200, 0, 0, fe7f0ff8, fee08520, feba1200)
----------------- lwp# 5 / thread# 5 --------------------
fee0857b __lwp_park (80981f0, 8098200, fe6f1f00) + b
fee02da6 cond_wait_queue (80981f0, 8098200, fe6f1f00) + 5e
fee03123 cond_wait_common (80981f0, 8098200, fe6f1f00) + 1db
fee03355 _cond_timedwait (80981f0, 8098200, 80a1798) + 51
fee033c0 cond_timedwait (80981f0, 8098200, 80a1798) + 24
fee033fc pthread_cond_timedwait (80981f0, 8098200, 80a1798, 806272a) + 1e
080622e8 plugin_read_thread (0) + 31c
fee08236 _thr_setup (feba1a00) + 4e
fee08520 _lwp_start (feba1a00, 0, 0, fe6f1ff8, fee08520, feba1a00)
----------------- lwp# 6 / thread# 6 --------------------
fee085ab __lwp_unpark (8098220, 80c30b4, fe5f2b98, 806a138) + b
0806a17d uc_update (80a3180, fe5f2d90, fe5f2ea8, 8064d2a) + a55
08065147 plugin_dispatch_values (fe5f2d90, feb1138c, 40, feb10b14) + 60f
feb10f9d disk_submit (80fd69c, feb1138c, 2c403de9, 4d, e4ac1800, f) + 499
feb110db disk_read (0, 24958599, 24958599, 24958599, 1000000, 0) + 12f
08062418 plugin_read_thread (0) + 44c
fee08236 _thr_setup (feba2200) + 4e
fee08520 _lwp_start (feba2200, 0, 0, fe5f2ff8, fee08520, feba2200)
More information about the collectd
mailing list