
Practical use of workqueue data structures in kernel crash analysis

Original article by johnazhang. Last modified: 2021-05-21 18:17:42. Published in the column "Linux问题笔记" (Linux troubleshooting notes).

There are already plenty of articles introducing the workqueue mechanism itself, which revolves around a few core data structures: work_struct, worker, workqueue and worker_pool.

This article does not walk through those structures in detail; instead it uses the relationships between them to dig out the information we need during real crash analysis.

The method in a nutshell: pending work_structs can be found through worker_pool->worklist; which pool a worker belongs to can be confirmed through worker_pool->idle_list; and the worker_pool for each CPU can be found through the per-CPU variable cpu_worker_pools. A simplified sketch of these relationships follows.
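To make the pointer chasing concrete, here is a minimal userspace C sketch of these relationships. The structures below are deliberately simplified stand-ins, not the real kernel definitions (the real worker_pool layout appears later); container_of and list_add_tail are re-implemented locally for the demo, and only the field names worklist, idle_list and entry mirror the kernel. The traversal is the same container_of trick that crash's "list -l work_struct.entry" option relies on: recover the containing structure from an embedded list_head.

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel structures; not the real definitions. */
struct list_head { struct list_head *next, *prev; };

struct work_struct {
    unsigned long     data;
    struct list_head  entry;              /* linked into worker_pool->worklist */
    void            (*func)(struct work_struct *work);
};

struct worker {
    struct list_head   entry;             /* linked into worker_pool->idle_list */
    struct work_struct *current_work;
};

struct worker_pool {
    struct list_head worklist;            /* pending work_structs */
    struct list_head idle_list;           /* idle workers */
};

/* Recover the containing structure from a pointer to an embedded member. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

static void list_add_tail(struct list_head *node, struct list_head *head)
{
    node->next = head;
    node->prev = head->prev;
    head->prev->next = node;
    head->prev = node;
}

static void dump_pool(struct worker_pool *pool)
{
    struct list_head *p;

    /* Each worklist node is a work_struct.entry field. */
    for (p = pool->worklist.next; p != &pool->worklist; p = p->next) {
        struct work_struct *w = container_of(p, struct work_struct, entry);
        printf("pending work %p, func %p\n", (void *)w, (void *)w->func);
    }
    /* Each idle_list node is a worker.entry field. */
    for (p = pool->idle_list.next; p != &pool->idle_list; p = p->next) {
        struct worker *wk = container_of(p, struct worker, entry);
        printf("idle worker %p\n", (void *)wk);
    }
}

int main(void)
{
    struct worker_pool pool = {
        .worklist  = { &pool.worklist,  &pool.worklist  },  /* empty: head points to itself */
        .idle_list = { &pool.idle_list, &pool.idle_list },
    };
    struct work_struct work = { 0 };
    struct worker idle = { { NULL, NULL }, NULL };

    list_add_tail(&work.entry, &pool.worklist);   /* queue one pending work */
    list_add_tail(&idle.entry, &pool.idle_list);  /* park one idle worker */
    dump_pool(&pool);
    return 0;
}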

Case study: a hung task stuck in a kworker

crash> ps | grep UN

6707 2 0 ffff9518c5544100 UN 0.0 0 0 [kworker/0:0]

18916 2 4 ffff951621d98000 UN 0.0 0 0 [kworker/4:2]

21802 2 4 ffff95154e98c100 UN 0.0 0 0 [kworker/4:0]

crash> bt ffff951621d98000

PID: 18916 TASK: ffff951621d98000 CPU: 4 COMMAND: "kworker/4:2"

#0 [ffff9516238afb98] __schedule at ffffffff92368a32

#1 [ffff9516238afc28] schedule at ffffffff92368ed9

#2 [ffff9516238afc38] schedule_timeout at ffffffff923669e1

#3 [ffff9516238afce8] wait_for_completion at ffffffff9236928d

#4 [ffff9516238afd48] call_usermodehelper_exec at ffffffff91cb6219

#5 [ffff9516238afd90] call_usermodehelper at ffffffff91cb67b0

#6 [ffff9516238afdb0] cgroup_release_agent at ffffffff91d22f97

#7 [ffff9516238afe20] process_one_work at ffffffff91cb9ebf

#8 [ffff9516238afe68] worker_thread at ffffffff91cbaf56

#9 [ffff9516238afec8] kthread at ffffffff91cc1da1

We can see that kworker/4:2 is stuck in cgroup_release_agent. Now we would like to know: what other work items are queued on it, and how many? (This information may not turn out to be directly useful, but when analyzing a crash, the more information the better; never discard anything that looks "extra".)

There is a global per-CPU worker_pool data structure called cpu_worker_pools:

crash> p cpu_worker_pools

PER-CPU DATA TYPE:

struct worker_pool cpu_worker_pools[2];

PER-CPU ADDRESSES:

[0]: ffff951cff21a400

[1]: ffff951cff25a400

[2]: ffff951cff29a400

[3]: ffff951cff2da400

[4]: ffff951cff31a400

[5]: ffff951cff35a400

[6]: ffff951cff39a400

[7]: ffff951cff3da400

[8]: ffff951cff41a400

[9]: ffff951cff45a400

[10]: ffff951cff49a400

[11]: ffff951cff4da400

[12]: ffff951cff51a400

[13]: ffff951cff55a400

[14]: ffff951cff59a400

[15]: ffff951cff5da400

The worker_pool data structure looks like this:

struct worker_pool {

[0x0] spinlock_t lock;

[0x4] int cpu;

[0x8] int node;

[0xc] int id;

[0x10] unsigned int flags;

[0x18] unsigned long watchdog_ts;

[0x20] struct list_head worklist;

[0x30] int nr_workers;

[0x34] int nr_idle;

[0x38] struct list_head idle_list;

[0x48] struct timer_list idle_timer;

[0x98] struct timer_list mayday_timer;

[0xe8] struct hlist_head busy_hash[64];

[0x2e8] struct mutex manager_arb;

[0x310] struct mutex manager_mutex;

[0x338] struct idr worker_idr;

[0x360] struct workqueue_attrs *attrs;

[0x368] struct hlist_node hash_node;

[0x378] int refcnt;

[0x380] atomic_t nr_running;

[0x388] struct callback_head rcu;

}

SIZE: 0x3c0

Let's look at the situation on CPU 4:

crash> worker_pool.worklist,idle_list ffff951cff31a400

worklist = {

next = 0xffff95167fd4a8f0,

prev = 0xffff951cff317d08

}

idle_list = {

next = 0xffff951cff31a438,

prev = 0xffff951cff31a438

}

crash> worker_pool.worklist,idle_list ffff951cff31a7c0

worklist = {

next = 0xffff951cff31a7e0,

prev = 0xffff951cff31a7e0

}

idle_list = {

next = 0xffff9517a1dce500,

prev = 0xffff951674bdc200

}

Note that two addresses are examined here, because cpu_worker_pools is an array of two pools per CPU (index 0 is the normal pool, index 1 the high-priority pool). The second pool's address is simply the first plus sizeof(struct worker_pool): ffff951cff31a400 + 0x3c0 = ffff951cff31a7c0.

worklist is the list of pending work_structs; idle_list is the list of idle kworkers (the latter is usually not that useful, beyond confirming which pool a given kworker belongs to).

Let's take a look at one of these "kworkers" (the idle_list next pointer of the first pool):

crash> worker 0xffff951cff31a438

struct worker {

{

entry = {

next = 0xffff951cff31a438,

prev = 0xffff951cff31a438

},

hentry = {

next = 0xffff951cff31a438,

pprev = 0xffff951cff31a438

}

},

current_work = 0x0,

current_func = 0xdead000000000200,

current_pwq = 0x7eb84f800,

desc_valid = true,

scheduled = {

next = 0xffffffff91cb9010,

prev = 0xffff951cff31a400

},

task = 0xffffffffffffffff,

pool = 0x0,

last_active = 0,

flags = 0,

id = 0,

desc = "\330;1\377\034\225\377\377\330;1\377\034\225\377\377+\357\223\353\a\000\000",

rescue_wq = 0xffff951cff313900

}

At first glance this looks like a worker that has already been freed yet is somehow still on the list. It is actually not a worker at all: for the normal pool on CPU 4, idle_list.next == idle_list.prev == 0xffff951cff31a438, which is just the pool base 0xffff951cff31a400 plus 0x38, i.e. the address of the idle_list head itself. In other words the idle_list is empty (a list_head pointing to itself), and dumping that head address as a struct worker simply prints whatever worker_pool fields happen to sit at those offsets, hence the garbage-looking values (the 0xdead000000000200 in current_func, for instance, is presumably the poisoned list pointer of the pool's idle_timer showing through).

Now let's look at a real kworker, from the second pool's idle_list:

crash> worker 0xffff9517a1dce500

struct worker {

{

entry = {

next = 0xffff951674bdc200,

prev = 0xffff951cff31a7f8

},

hentry = {

next = 0xffff951674bdc200,

pprev = 0xffff951cff31a7f8

}

},

current_work = 0x0,

current_func = 0x0,

current_pwq = 0x0,

desc_valid = false,

scheduled = {

next = 0xffff9517a1dce530,

prev = 0xffff9517a1dce530

},

task = 0xffff951cf4f2e180,

pool = 0xffff951cff31a7c0,

last_active = 34017089150,

flags = 13,

id = 1,

desc = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",

rescue_wq = 0x0

}

crash> bt 0xffff951cf4f2e180

PID: 711 TASK: ffff951cf4f2e180 CPU: 4 COMMAND: "kworker/4:1H"

#0 [ffff951cfc383dc8] __schedule at ffffffff92368a32

#1 [ffff951cfc383e58] schedule at ffffffff92368ed9

#2 [ffff951cfc383e68] worker_thread at ffffffff91cbb009

#3 [ffff951cfc383ec8] kthread at ffffffff91cc1da1

This one is a perfectly normal idle kworker.

Now let's look at the pending works:

crash> list -l work_struct.entry -s work_struct 0xffff95167fd4a8f0

ffff95167fd4a8f0

struct work_struct {

data = {

counter = -117523203621883

},

entry = {

next = 0xffffffff928b89a8,

prev = 0xffff951cff31a420

},

func = 0xffffffff91e376e0

}

ffffffff928b89a8

struct work_struct {

data = {

counter = -117523203621883

},

entry = {

next = 0xffff951cff317d08,

prev = 0xffff95167fd4a8f0

},

func = 0xffffffff91eeaef0

}

ffff951cff317d08

struct work_struct {

data = {

counter = -117523203621883

},

entry = {

next = 0xffff951cff31a420,

prev = 0xffffffff928b89a8

},

func = 0xffffffff91dd7de0

}

ffff951cff31a420

struct work_struct {

data = {

counter = 34016909432

},

entry = {

next = 0xffff95167fd4a8f0,

prev = 0xffff951cff317d08

},

func = 0x2

}

crash> sym 0xffffffff91dd7de0

ffffffff91dd7de0 (t) vmstat_update /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/mm/vmstat.c: 1245

crash> sym 0xffffffff91e376e0

ffffffff91e376e0 (t) vmpressure_work_fn /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/mm/vmpressure.c: 167

crash> sym 0xffffffff91eeaef0

ffffffff91eeaef0 (t) key_garbage_collector /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/security/keys/gc.c: 174

As we can see, the pending works come from various subsystems (vmstat, vmpressure, the key garbage collector); nothing particularly enlightening so far.

And what is that last entry with func = 0x2? It is not a real work_struct at all: 0xffff951cff31a420 is the pool base 0xffff951cff31a400 plus 0x20, i.e. the worker_pool's worklist head itself. Since the list command walks the circular list all the way around, it also prints the head, reinterpreted as a work_struct.

Since nothing useful has turned up, let's fall back to the most primitive method and analyze the stack directly (feel free to skip this part). From worker_thread's disassembly below, process_one_work is called with the worker in %rdi (copied from %r15) and the work_struct in %rsi; process_one_work's prologue then pushes the callee-saved registers onto its own frame, which is how the worker pointer ffff951602de5780 can be fished out of the stack dump that follows.

0xffffffff91cbb09c <worker_thread+620>: lea -0x8(%rax),%rsi

0xffffffff91cbb0a0 <worker_thread+624>: mov %r15,%rdi

0xffffffff91cbb0a3 <worker_thread+627>: callq 0xffffffff91cb9d40 <process_one_work>

0xffffffff91cb9d40 <process_one_work>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]

0xffffffff91cb9d45 <process_one_work+5>: push %rbp

0xffffffff91cb9d46 <process_one_work+6>: mov %rsp,%rbp

0xffffffff91cb9d49 <process_one_work+9>: push %r15

#7 [ffff9516238afe20] process_one_work at ffffffff91cb9ebf

ffff9516238afe28: 00000000ff31a420 0000000000000000

ffff9516238afe38: ffff951cff31a420 ffff951602de57b0

ffff9516238afe48: ffff951621d98000 ffff951cff31a400

ffff9516238afe58: ffff951602de5780 ffff9516238afec0

ffff9516238afe68: ffffffff91cbaf56

crash> worker ffff951602de5780

struct worker {

{

entry = {

next = 0x0,

prev = 0xffff951cff31a570

},

hentry = {

next = 0x0,

pprev = 0xffff951cff31a570

}

},

current_work = 0xffffffff92861140,

current_func = 0xffffffff91d22ec0,

current_pwq = 0xffff951cff31fc00,

desc_valid = false,

scheduled = {

next = 0xffff951602de57b0,

prev = 0xffff951602de57b0

},

task = 0xffff951621d98000,

pool = 0xffff951cff31a400,

last_active = 34016908957,

flags = 1,

id = 2,

desc = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",

rescue_wq = 0x0

}

crash> sym 0xffffffff91d22ec0

ffffffff91d22ec0 (t) cgroup_release_agent /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/kernel/cgroup.c: 5066

0xffffffff91cb9eb7 <process_one_work+375>: mov %rbx,%rdi

0xffffffff91cb9eba <process_one_work+378>: callq 0xffffffff91f86ff0 <__x86_indirect_thunk_rax>

crash> dis cgroup_release_agent

0xffffffff91d22ec0 <cgroup_release_agent>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]

0xffffffff91d22ec5 <cgroup_release_agent+5>: push %rbp

0xffffffff91d22ec6 <cgroup_release_agent+6>: mov %rsp,%rbp

0xffffffff91d22ec9 <cgroup_release_agent+9>: push %r15

0xffffffff91d22ecb <cgroup_release_agent+11>: push %r14

0xffffffff91d22ecd <cgroup_release_agent+13>: push %r13

0xffffffff91d22ecf <cgroup_release_agent+15>: push %r12

0xffffffff91d22ed1 <cgroup_release_agent+17>: push %rbx

#6 [ffff9516238afdb0] cgroup_release_agent at ffffffff91d22f97

ffff9516238afdb8: ffff95160153f080 ffff951be126c000

ffff9516238afdc8: 0000000000000000 ffffffff926508f7

ffff9516238afdd8: ffffffff92668478 0000000000000000

ffff9516238afde8: 0000000089656efe ffffffff92861140

ffff9516238afdf8: ffff951602de5780 ffff951cff31a400

ffff9516238afe08: ffff951cff31fc00 0000000000000100

ffff9516238afe18: ffff9516238afe60 ffffffff91cb9ebf

crash> work_struct ffffffff92861140

struct work_struct {

data = {

counter = 256

},

entry = {

next = 0xffffffff92861148,

prev = 0xffffffff92861148

},

func = 0xffffffff91d22ec0

}

crash> sym 0xffffffff91d22ec0

ffffffff91d22ec0 (t) cgroup_release_agent /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/kernel/cgroup.c: 5066

It is the same function we already saw in the very first backtrace, so this tells us nothing new; in the end we have to go read the source.

From the source along the cgroup_release_agent path we learn the following:

This path synchronously waits while a user-space command is spawned to remove the cgroup. So which command was launched? A rough sketch of this code path is shown below; after that we go back to the stack to recover the actual command string.
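For context, here is a rough C sketch of what the cgroup_release_agent path does in 3.10-era kernels before it blocks: build an argv/envp for the release agent and hand it to call_usermodehelper, then wait for the exec to complete. This is paraphrased from memory and simplified, not a verbatim copy of the kernel source; the UMH_WAIT_EXEC value (1) matches the mov $0x1,%ecx visible in the disassembly below.

/*
 * Simplified sketch of the cgroup release-agent call, paraphrased from
 * the 3.10-era kernel; release_agent_sketch and its parameters are
 * illustrative names, not the real function.
 */
#include <linux/kmod.h>

static void release_agent_sketch(const char *agent_path, char *cgroup_path)
{
        char *argv[3], *envp[3];

        argv[0] = (char *)agent_path;   /* e.g. /usr/lib/systemd/systemd-cgroups-agent */
        argv[1] = cgroup_path;          /* the cgroup that just became empty */
        argv[2] = NULL;

        envp[0] = "HOME=/";
        envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
        envp[2] = NULL;

        /*
         * UMH_WAIT_EXEC == 1: block (via wait_for_completion inside
         * call_usermodehelper_exec) until the helper has been exec'ed --
         * exactly where kworker/4:2 is stuck in the backtrace.
         */
        call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
}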

0xffffffff91d22f81 <cgroup_release_agent+193>: mov -0x60(%rbp),%rdi

0xffffffff91d22f85 <cgroup_release_agent+197>: lea -0x48(%rbp),%rdx

0xffffffff91d22f89 <cgroup_release_agent+201>: lea -0x60(%rbp),%rsi

0xffffffff91d22f8d <cgroup_release_agent+205>: mov $0x1,%ecx

0xffffffff91d22f92 <cgroup_release_agent+210>: callq 0xffffffff91cb6770 <call_usermodehelper>

#6 [ffff9516238afdb0] cgroup_release_agent at ffffffff91d22f97

ffff9516238afdb8: ffff95160153f080 ffff951be126c000

ffff9516238afdc8: 0000000000000000 ffffffff926508f7

ffff9516238afdd8: ffffffff92668478 0000000000000000

ffff9516238afde8: 0000000089656efe ffffffff92861140

ffff9516238afdf8: ffff951602de5780 ffff951cff31a400

ffff9516238afe08: ffff951cff31fc00 0000000000000100

ffff9516238afe18: ffff9516238afe60 ffffffff91cb9ebf

crash> rd ffff95160153f080 32

ffff95160153f080: 62696c2f7273752f 646d65747379732f /usr/lib/systemd

ffff95160153f090: 646d65747379732f 7370756f7267632d /systemd-cgroups

ffff95160153f0a0: 0000746e6567612d 0003000200000001 -agent..........

So in the end we know the command being launched was /usr/lib/systemd/systemd-cgroups-agent, and that command got stuck, which in turn left the kworker stuck and ultimately led to the reboot.

That was a long detour: in this particular case we could have reached the same conclusion purely from the stack, without analyzing the kworker-related data structures at all. Still, the exercise serves as a starting point for discussion, demonstrating a method for working with these data structures.

To summarize the findings:

1. The work a kworker is currently executing is not on the worker_pool's pending list (worklist). To find the currently executing work, reading it off the kworker's stack (or from worker->current_work / current_func) is the most reliable and correct way.

2. The work_structs waiting to be executed, in order, can be found on the worker_pool's worklist.

3. Idle kworkers can be found on the worker_pool's idle_list (per the source, a worker that stays idle long enough is destroyed automatically). Note that an empty idle_list points to itself, so the "suspicious freed worker" seen earlier was really just the list head of an empty idle_list, not a dangling worker. A quick cheat sheet of the crash commands used above follows.
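For quick reference, the crash commands used throughout this method, collected in one place (the addresses are of course specific to this vmcore; substitute your own):

crash> ps | grep UN                                      # find uninterruptible (hung) tasks
crash> bt <task_addr>                                    # backtrace of the stuck kworker
crash> p cpu_worker_pools                                # per-CPU worker_pool array addresses
crash> worker_pool.worklist,idle_list <pool_addr>        # pending works / idle workers of a pool
crash> list -l work_struct.entry -s work_struct <addr>   # walk the pending work list
crash> worker <worker_addr>                              # inspect one worker (current_work, task, pool)
crash> sym <func_addr>                                   # resolve a work function to a symbol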

Original statement: this article was published on the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission.

For infringement concerns, please contact cloudcommunity@tencent.com for removal.

