关于workqueue机制的介绍文章比较多,主要就几个核心的数据结构:work_struct、worker、workqueue、worker_pool 等。
本文不展开讲这些数据结构,而是通过利用它们之间的关系,在crash分析实战中找到想要的东西。
先说办法,找work_struct,可以通过worker_pool->worklist;找worker属于哪个pool,可以通过worker_pool->idle_list;而找每个CPU对应的worker_pool,可以通过变量cpu_worker_pools。
案例:hung task卡在kworker上了
crash> ps | grep UN
6707 2 0 ffff9518c5544100 UN 0.0 0 0 [kworker/0:0]
18916 2 4 ffff951621d98000 UN 0.0 0 0 [kworker/4:2]
21802 2 4 ffff95154e98c100 UN 0.0 0 0 [kworker/4:0]
crash> bt ffff951621d98000
PID: 18916 TASK: ffff951621d98000 CPU: 4 COMMAND: "kworker/4:2"
#0 [ffff9516238afb98] __schedule at ffffffff92368a32
#1 [ffff9516238afc28] schedule at ffffffff92368ed9
#2 [ffff9516238afc38] schedule_timeout at ffffffff923669e1
#3 [ffff9516238afce8] wait_for_completion at ffffffff9236928d
#4 [ffff9516238afd48] call_usermodehelper_exec at ffffffff91cb6219
#5 [ffff9516238afd90] call_usermodehelper at ffffffff91cb67b0
#6 [ffff9516238afdb0] cgroup_release_agent at ffffffff91d22f97
#7 [ffff9516238afe20] process_one_work at ffffffff91cb9ebf
#8 [ffff9516238afe68] worker_thread at ffffffff91cbaf56
#9 [ffff9516238afec8] kthread at ffffffff91cc1da1
可以看到,kworker/4:2 卡在 cgroup_release_agent上了,我们现在想知道它身上还有哪些work?多不多?(虽然这些信息不一定能起到什么帮助,但是在分析crash的时候自然是知道越多的信息越好,不放过任何“多余”的信息)
这里有一个全局的per_cpu worker_pool数据结构,叫cpu_worker_pools:
crash> p cpu_worker_pools
PER-CPU DATA TYPE:
struct worker_pool cpu_worker_pools[2];
PER-CPU ADDRESSES:
[0]: ffff951cff21a400
[1]: ffff951cff25a400
[2]: ffff951cff29a400
[3]: ffff951cff2da400
[4]: ffff951cff31a400
[5]: ffff951cff35a400
[6]: ffff951cff39a400
[7]: ffff951cff3da400
[8]: ffff951cff41a400
[9]: ffff951cff45a400
[10]: ffff951cff49a400
[11]: ffff951cff4da400
[12]: ffff951cff51a400
[13]: ffff951cff55a400
[14]: ffff951cff59a400
[15]: ffff951cff5da400
worker_pool数据结构:
struct worker_pool {
[0x0] spinlock_t lock;
[0x4] int cpu;
[0x8] int node;
[0xc] int id;
[0x10] unsigned int flags;
[0x18] unsigned long watchdog_ts;
[0x20] struct list_head worklist;
[0x30] int nr_workers;
[0x34] int nr_idle;
[0x38] struct list_head idle_list;
[0x48] struct timer_list idle_timer;
[0x98] struct timer_list mayday_timer;
[0xe8] struct hlist_head busy_hash[64];
[0x2e8] struct mutex manager_arb;
[0x310] struct mutex manager_mutex;
[0x338] struct idr worker_idr;
[0x360] struct workqueue_attrs *attrs;
[0x368] struct hlist_node hash_node;
[0x378] int refcnt;
[0x380] atomic_t nr_running;
[0x388] struct callback_head rcu;
}
SIZE: 0x3c0
我们来看下CPU4上的情况:
crash> worker_pool.worklist,idle_list ffff951cff31a400
worklist = {
next = 0xffff95167fd4a8f0,
prev = 0xffff951cff317d08
}
idle_list = {
next = 0xffff951cff31a438,
prev = 0xffff951cff31a438
}
crash> worker_pool.worklist,idle_list ffff951cff31a7c0
worklist = {
next = 0xffff951cff31a7e0,
prev = 0xffff951cff31a7e0
}
idle_list = {
next = 0xffff9517a1dce500,
prev = 0xffff951674bdc200
}
注意这里是两个地址,因为cpu_worker_pools是个大小为2的数组。
worklist是pending的work_struct的链表,idle_list是空闲的kworker的链表(这个一般也没啥用,就是验证一下kworker在哪个pool里)。
随便来看一个kworker:
crash> worker 0xffff951cff31a438
struct worker {
{
entry = {
next = 0xffff951cff31a438,
prev = 0xffff951cff31a438
},
hentry = {
next = 0xffff951cff31a438,
pprev = 0xffff951cff31a438
}
},
current_work = 0x0,
current_func = 0xdead000000000200,
current_pwq = 0x7eb84f800,
desc_valid = true,
scheduled = {
next = 0xffffffff91cb9010,
prev = 0xffff951cff31a400
},
task = 0xffffffffffffffff,
pool = 0x0,
last_active = 0,
flags = 0,
id = 0,
desc = "\330;1\377\034\225\377\377\330;1\377\034\225\377\377+\357\223\353\a\000\000",
rescue_wq = 0xffff951cff313900
这里看起来像是这个worker已经被释放了?其实不然:注意 0xffff951cff31a438 = 0xffff951cff31a400 + 0x38,而从上面的结构体布局可知 0x38 正是 worker_pool 中 idle_list 成员自身的偏移——也就是说这个地址是链表头本身,不是一个 worker。idle_list 的 next 和 prev 都指向自己,说明这是一个空链表,该 pool 当前没有空闲 worker;我们把链表头的地址误当成 worker 去解析,看到的自然是把 pool 内存按 worker 布局解读出来的“乱码”(比如 0xdead000000000200 这种毒化指针样式的值)。
再来看一个kworker:
crash> worker 0xffff9517a1dce500
struct worker {
{
entry = {
next = 0xffff951674bdc200,
prev = 0xffff951cff31a7f8
},
hentry = {
next = 0xffff951674bdc200,
pprev = 0xffff951cff31a7f8
}
},
current_work = 0x0,
current_func = 0x0,
current_pwq = 0x0,
desc_valid = false,
scheduled = {
next = 0xffff9517a1dce530,
prev = 0xffff9517a1dce530
},
task = 0xffff951cf4f2e180,
pool = 0xffff951cff31a7c0,
last_active = 34017089150,
flags = 13,
id = 1,
desc = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
rescue_wq = 0x0
}
crash> bt 0xffff951cf4f2e180
PID: 711 TASK: ffff951cf4f2e180 CPU: 4 COMMAND: "kworker/4:1H"
#0 [ffff951cfc383dc8] __schedule at ffffffff92368a32
#1 [ffff951cfc383e58] schedule at ffffffff92368ed9
#2 [ffff951cfc383e68] worker_thread at ffffffff91cbb009
#3 [ffff951cfc383ec8] kthread at ffffffff91cc1da1
这个就是一个很正常的kworker。
再看看work:
crash> list -l work_struct.entry -s work_struct 0xffff95167fd4a8f0
ffff95167fd4a8f0
struct work_struct {
data = {
counter = -117523203621883
},
entry = {
next = 0xffffffff928b89a8,
prev = 0xffff951cff31a420
},
func = 0xffffffff91e376e0
}
ffffffff928b89a8
struct work_struct {
data = {
counter = -117523203621883
},
entry = {
next = 0xffff951cff317d08,
prev = 0xffff95167fd4a8f0
},
func = 0xffffffff91eeaef0
}
ffff951cff317d08
struct work_struct {
data = {
counter = -117523203621883
},
entry = {
next = 0xffff951cff31a420,
prev = 0xffffffff928b89a8
},
func = 0xffffffff91dd7de0
}
ffff951cff31a420
struct work_struct {
data = {
counter = 34016909432
},
entry = {
next = 0xffff95167fd4a8f0,
prev = 0xffff951cff317d08
},
func = 0x2
}
crash> sym 0xffffffff91dd7de0
ffffffff91dd7de0 (t) vmstat_update /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/mm/vmstat.c: 1245
crash> sym 0xffffffff91e376e0
ffffffff91e376e0 (t) vmpressure_work_fn /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/mm/vmpressure.c: 167
crash> sym 0xffffffff91eeaef0
ffffffff91eeaef0 (t) key_garbage_collector /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/security/keys/gc.c: 174
可以看到work包含了各个模块,暂时没有看到启发性的信息。
这里最后这个0x2又是什么鬼?
没发现什么有用的信息,回到最原始的办法,从栈里分析,不感兴趣可以略过了:
0xffffffff91cbb09c <worker_thread+620>: lea -0x8(%rax),%rsi
0xffffffff91cbb0a0 <worker_thread+624>: mov %r15,%rdi
0xffffffff91cbb0a3 <worker_thread+627>: callq 0xffffffff91cb9d40 <process_one_work>
0xffffffff91cb9d40 <process_one_work>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff91cb9d45 <process_one_work+5>: push %rbp
0xffffffff91cb9d46 <process_one_work+6>: mov %rsp,%rbp
0xffffffff91cb9d49 <process_one_work+9>: push %r15
#7 [ffff9516238afe20] process_one_work at ffffffff91cb9ebf
ffff9516238afe28: 00000000ff31a420 0000000000000000
ffff9516238afe38: ffff951cff31a420 ffff951602de57b0
ffff9516238afe48: ffff951621d98000 ffff951cff31a400
ffff9516238afe58: ffff951602de5780 ffff9516238afec0
ffff9516238afe68: ffffffff91cbaf56
crash> worker ffff951602de5780
struct worker {
{
entry = {
next = 0x0,
prev = 0xffff951cff31a570
},
hentry = {
next = 0x0,
pprev = 0xffff951cff31a570
}
},
current_work = 0xffffffff92861140,
current_func = 0xffffffff91d22ec0,
current_pwq = 0xffff951cff31fc00,
desc_valid = false,
scheduled = {
next = 0xffff951602de57b0,
prev = 0xffff951602de57b0
},
task = 0xffff951621d98000,
pool = 0xffff951cff31a400,
last_active = 34016908957,
flags = 1,
id = 2,
desc = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
rescue_wq = 0x0
}
crash> sym 0xffffffff91d22ec0
ffffffff91d22ec0 (t) cgroup_release_agent /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/kernel/cgroup.c: 5066
0xffffffff91cb9eb7 <process_one_work+375>: mov %rbx,%rdi
0xffffffff91cb9eba <process_one_work+378>: callq 0xffffffff91f86ff0 <__x86_indirect_thunk_rax>
crash> dis cgroup_release_agent
0xffffffff91d22ec0 <cgroup_release_agent>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff91d22ec5 <cgroup_release_agent+5>: push %rbp
0xffffffff91d22ec6 <cgroup_release_agent+6>: mov %rsp,%rbp
0xffffffff91d22ec9 <cgroup_release_agent+9>: push %r15
0xffffffff91d22ecb <cgroup_release_agent+11>: push %r14
0xffffffff91d22ecd <cgroup_release_agent+13>: push %r13
0xffffffff91d22ecf <cgroup_release_agent+15>: push %r12
0xffffffff91d22ed1 <cgroup_release_agent+17>: push %rbx
#6 [ffff9516238afdb0] cgroup_release_agent at ffffffff91d22f97
ffff9516238afdb8: ffff95160153f080 ffff951be126c000
ffff9516238afdc8: 0000000000000000 ffffffff926508f7
ffff9516238afdd8: ffffffff92668478 0000000000000000
ffff9516238afde8: 0000000089656efe ffffffff92861140
ffff9516238afdf8: ffff951602de5780 ffff951cff31a400
ffff9516238afe08: ffff951cff31fc00 0000000000000100
ffff9516238afe18: ffff9516238afe60 ffffffff91cb9ebf
crash> work_struct ffffffff92861140
struct work_struct {
data = {
counter = 256
},
entry = {
next = 0xffffffff92861148,
prev = 0xffffffff92861148
},
func = 0xffffffff91d22ec0
}
crash> sym 0xffffffff91d22ec0
ffffffff91d22ec0 (t) cgroup_release_agent /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/kernel/cgroup.c: 5066
都是一样的函数,从最早的栈里就能看到,没有什么意义,因此最后只能去看源码。
从cgroup_release_agent这个路径里的源码里得知:
这是一个同步等待rm掉cgroup的过程,是起了一个用户态命令来rm,那么是起了什么命令呢?继续通过栈上找
0xffffffff91d22f81 <cgroup_release_agent+193>: mov -0x60(%rbp),%rdi
0xffffffff91d22f85 <cgroup_release_agent+197>: lea -0x48(%rbp),%rdx
0xffffffff91d22f89 <cgroup_release_agent+201>: lea -0x60(%rbp),%rsi
0xffffffff91d22f8d <cgroup_release_agent+205>: mov $0x1,%ecx
0xffffffff91d22f92 <cgroup_release_agent+210>: callq 0xffffffff91cb6770 <call_usermodehelper>
#6 [ffff9516238afdb0] cgroup_release_agent at ffffffff91d22f97
ffff9516238afdb8: ffff95160153f080 ffff951be126c000
ffff9516238afdc8: 0000000000000000 ffffffff926508f7
ffff9516238afdd8: ffffffff92668478 0000000000000000
ffff9516238afde8: 0000000089656efe ffffffff92861140
ffff9516238afdf8: ffff951602de5780 ffff951cff31a400
ffff9516238afe08: ffff951cff31fc00 0000000000000100
ffff9516238afe18: ffff9516238afe60 ffffffff91cb9ebf
crash> rd ffff95160153f080 32
ffff95160153f080: 62696c2f7273752f 646d65747379732f /usr/lib/systemd
ffff95160153f090: 646d65747379732f 7370756f7267632d /systemd-cgroups
ffff95160153f0a0: 0000746e6567612d 0003000200000001 -agent..........
最后知道是起了/usr/lib/systemd/systemd-cgroups-agent这个命令,然后这个命令卡住了,最终导致kworker卡住,触发hung task并引发重启。
绕了一大圈,其实在本例中不分析kworker相关的数据结构、光从栈上找也能得出最后的结论,但这里就当是抛砖引玉,提供一个分析kworker相关数据结构的方法。
总结一下发现:
1. kworker当前正在执行的work是不在worker_pool的pending链表worklist上的,因此如果要找当前正在执行的work,从kworker栈上是最稳妥且正确的方式。
2. 将要执行的work_struct的顺序可以通过worker_pool的worklist上找到。
3. 闲置的kworker可以通过worker_pool的idle_list找到(从源码来看,worker闲置了一段时间后会被自动释放。本例中第一个pool的idle_list其实是空链表——next/prev都指向链表头自身的地址0xffff951cff31a438,把这个链表头地址当成worker解析,才看到了疑似“被释放的worker”的乱码数据,并非RCU使用错误)。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。