
From a softlockup problem: a look at the kernel's IPI implementation

Original post by cdh, last modified 2020-07-29.

Environment:

x86-64, CentOS 7, kernel 3.10.0-693.el7.x86_64

From the bt stack trace we can see that CPU 2 was stuck in smp_call_function_many, which made the watchdog time out and triggered a system reboot:

crash> bt
PID: 12263  TASK: ffff8803bbebdee0  CPU: 2  COMMAND: "kworker/u8:0"
bt: page excluded: kernel virtual address: ffffffffffffffff type: "cpu_online_map"
 #0 [ffff88043fd03cf8] machine_kexec at ffffffff8105c4cb
 #1 [ffff88043fd03d58] __crash_kexec at ffffffff81104a32
 #2 [ffff88043fd03e28] panic at ffffffff8169dc5f
 #3 [ffff88043fd03ea8] watchdog_timer_fn at ffffffff8112f651
 #4 [ffff88043fd03ee0] __hrtimer_run_queues at ffffffff810b4ae4
 #5 [ffff88043fd03f38] hrtimer_interrupt at ffffffff810b507f
 #6 [ffff88043fd03f80] local_apic_timer_interrupt at ffffffff81053895
 #7 [ffff88043fd03f98] smp_apic_timer_interrupt at ffffffff816b76bd
 #8 [ffff88043fd03fb0] apic_timer_interrupt at ffffffff816b5c1d
--- <IRQ stack> ---
 #9 [ffff88000399b738] apic_timer_interrupt at ffffffff816b5c1d
    [exception RIP: smp_call_function_many+514]
    RIP: ffffffff810f99a2  RSP: ffff88000399b7e8  RFLAGS: 00000202
    RAX: 0000000000000003  RBX: 00000000000000fc  RCX: ffff88043fd9adb8
    RDX: 0000000000000003  RSI: 0000000000000004  RDI: 0000000000000000
    RBP: ffff88000399b820   R8: ffff88017a1ee000   R9: ffffffff813227d9
    R10: ffff88043fd19c80  R11: ffffea00000c2100  R12: 0000000000000292
    R13: ffff88000399b798  R14: ffffea0010b1f842  R15: 0000000000000000
    ORIG_RAX: ffffffffffffff10  CS: 0010  SS: 0018
#10 [ffff88000399b828] native_flush_tlb_others at ffffffff8106e668
#11 [ffff88000399b878] flush_tlb_page at ffffffff8106e864

Combining the disassembly of smp_call_function_many with the source, we can infer that CPU 2 was spinning in the while loop inside csd_lock_wait:

crash> dis smp_call_function_many
...
0xffffffff810f99a0 <smp_call_function_many+512>: pause
0xffffffff810f99a2 <smp_call_function_many+514>: testb $0x1,0x20(%rcx)
0xffffffff810f99a6 <smp_call_function_many+518>: jne 0xffffffff810f99a0 <smp_call_function_many+512>
...

void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait)
{
        ...
        if (wait) {
                /* Wait, in order, for every CPU that received this CPU's
                 * IPI to update the flag in its csd. */
                for_each_cpu(cpu, cfd->cpumask) {
                        struct call_single_data *csd;

                        csd = per_cpu_ptr(cfd->csd, cpu);
                        /* Spin until CSD_FLAG_LOCK is cleared from the
                         * per-cpu csd's flags, meaning the target CPU has
                         * handled the IPI callback this CPU sent. */
                        csd_lock_wait(csd);
                }
        }
        ...
}
enum {
        CSD_FLAG_LOCK           = 0x01,
        CSD_FLAG_WAIT           = 0x02,
};
static void csd_lock_wait(struct call_single_data *csd)
{
        while (csd->flags & CSD_FLAG_LOCK)
                cpu_relax();
}
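
Incidentally, on x86 cpu_relax() is the pause instruction (encoded as rep; nop), which is exactly the instruction at smp_call_function_many+512 in the disassembly above. Roughly, from the x86 headers of this kernel generation:

/* x86: cpu_relax() emits PAUSE ("rep; nop"), a spin-wait hint to the CPU. */
static inline void rep_nop(void)
{
        asm volatile("rep; nop" ::: "memory");
}

static inline void cpu_relax(void)
{
        rep_nop();
}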

After a CPU that received the IPI has run the callback, it clears the csd's flags:

static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
        ...
        llist_for_each_entry_safe(csd, csd_next, entry, llist) {
                csd->func(csd->info);   /* run the IPI callback */
                csd_unlock(csd);        /* clear the csd's flags */
        }
        ...
}
static void csd_unlock(struct call_single_data *csd)
{
        ...
        csd->flags &= ~CSD_FLAG_LOCK;
}
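
For completeness, the sender-side counterpart csd_lock is what sets CSD_FLAG_LOCK before the csd is queued and the IPI sent; a sketch along the lines of the kernel sources of this era:

static void csd_lock(struct call_single_data *csd)
{
        /* If a previous user of this csd has not unlocked it yet, wait. */
        csd_lock_wait(csd);
        csd->flags |= CSD_FLAG_LOCK;

        /* Make sure the flag is visible before the csd is published on
         * the target CPU's call_single_queue. */
        smp_mb();
}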

To show that this theory holds, we need to find the address stored in the call_single_data pointer csd:

void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait)
{
        ...
        cfd = &__get_cpu_var(cfd_data); /* this CPU's cfd_data address */
        ...
        if (wait) {
                for_each_cpu(cpu, cfd->cpumask) {
                        struct call_single_data *csd;

                        /* the per-cpu csd, taken from cfd */
                        csd = per_cpu_ptr(cfd->csd, cpu);
                        csd_lock_wait(csd);
                }
        }
}
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))

The machine in question has four CPUs:


crash> p nr_cpu_ids
nr_cpu_ids = $2 = 4
crash> 
crash> px cfd_data
PER-CPU DATA TYPE:
  struct call_function_data cfd_data;
PER-CPU ADDRESSES:
  [0]: ffff88043fc17840
  [1]: ffff88043fc97840
  [2]: ffff88043fd17840
  [3]: ffff88043fd97840
crash> 

The CPU blocked in smp_call_function_many is cpu2:

crash> px cfd_data:2
per_cpu(cfd_data, 2) = $3 = {
  csd = 0x1adb8, 
  cpumask = 0xffff88017a1ee000
}
crash> 

In per_cpu(cfd_data, 2) the cpumask value is 10; in binary that is 1010, i.e. bit 3 and bit 1 are set, corresponding to cpu3 and cpu1.

crash> struct cpumask 0xffff88017a1ee000
struct cpumask {
  bits = {10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash>
crash> eval -b 10
hexadecimal: a
decimal: 10
octal: 12
binary: 0000000000000000000000000000000000000000000000000000000000001010
bits set: 3 1
crash>

The bits set in per_cpu(cfd_data, 2)->cpumask record which CPUs this CPU sent IPIs to; in this vmcore it means cpu2 sent IPIs to cpu1 and cpu3:

void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait)
{
        struct call_function_data *cfd;
        ...
        cfd = &__get_cpu_var(cfd_data); /* this CPU's cfd_data address */
        /* cfd->cpumask = mask & cpu_online_mask */
        cpumask_and(cfd->cpumask, mask, cpu_online_mask);
        /* clear our own bit: no IPI to ourselves */
        cpumask_clear_cpu(this_cpu, cfd->cpumask);
        ...
        /* initialize the per-cpu csd of every target CPU */
        for_each_cpu(cpu, cfd->cpumask) {
                struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);

                csd_lock(csd);
                csd->func = func;       /* callback the IPI'd CPU will run */
                csd->info = info;
                /* The sender publishes the csd by adding csd->llist to the
                 * target CPU's call_single_queue per-cpu list. */
                llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
        }

        /* Send a message to all CPUs in the map */
        arch_send_call_function_ipi_mask(cfd->cpumask); /* IPI every CPU whose bit is set */

        if (wait) {
                /* wait, in order, for each IPI'd CPU to update its csd flag */
                for_each_cpu(cpu, cfd->cpumask) {
                        struct call_single_data *csd;

                        csd = per_cpu_ptr(cfd->csd, cpu);
                        csd_lock_wait(csd);
                }
        }
}

Looking at the implementation of per_cpu_ptr, the csd address is computed by adding the receiving CPU's per-cpu base, __per_cpu_offset[cpu], to this CPU's cfd->csd value:

#ifndef __per_cpu_offset
extern unsigned long __per_cpu_offset[NR_CPUS];
/* per-cpu base address for CPU x; in this problem x is 1 and 3 */
#define per_cpu_offset(x) (__per_cpu_offset[x])
#endif

#ifndef SHIFT_PERCPU_PTR
/* Weird cast keeps both GCC and sparse happy. */
#define SHIFT_PERCPU_PTR(__p, __offset) ({                              \
        __verify_pcpu_ptr((__p));                                       \
        RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)); \
})
#endif

#ifdef CONFIG_SMP
#define per_cpu_ptr(ptr, cpu)   SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
#else
#define per_cpu_ptr(ptr, cpu)   ({ (void)(cpu); VERIFY_PERCPU_PTR((ptr)); })
#endif
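
In other words, cfd->csd holds a small, offset-like per-cpu value (0x1adb8 in this vmcore), and the real csd address for each target CPU is that value plus the target's per-cpu base. Conceptually, using the numbers confirmed in the crash session below:

/* Conceptual computation with this vmcore's values:
 *   csd(cpu) = cfd->csd + __per_cpu_offset[cpu]
 *
 *   csd(1) = 0x1adb8 + 0xffff88043fc80000 = 0xffff88043fc9adb8
 *   csd(3) = 0x1adb8 + 0xffff88043fd80000 = 0xffff88043fd9adb8
 */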

void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait)
{
        ...
        if (wait) {
                for_each_cpu(cpu, cfd->cpumask) {
                        struct call_single_data *csd;

                        /* here cfd->csd is cpu2's value; cpu is 1 and 3 */
                        csd = per_cpu_ptr(cfd->csd, cpu);
                        csd_lock_wait(csd);
                }
        }
        ...
}

On this four-core machine only the first four entries of __per_cpu_offset are meaningful:

crash> px __per_cpu_offset
__per_cpu_offset = $7 =
{0xffff88043fc00000, 0xffff88043fc80000, 0xffff88043fd00000, 0xffff88043fd80000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, ...}

Alternatively, kmem -o shows the per-cpu base addresses:

crash> kmem -o
PER-CPU OFFSET VALUES:
  CPU 0: ffff88043fc00000
  CPU 1: ffff88043fc80000
  CPU 2: ffff88043fd00000
  CPU 3: ffff88043fd80000
crash>

From per_cpu(cfd_data, 2)->cpumask we know cpu2 sent IPIs only to cpu1 and cpu3 this time, and cpu2's cfd->csd value is 0x1adb8:

crash> p cfd_data:2
per_cpu(cfd_data, 2) = $8 = {
  csd = 0x1adb8,
  cpumask = 0xffff88017a1ee000
}
crash> struct cpumask 0xffff88017a1ee000
struct cpumask {
  bits = {10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash> eval -b 10
hexadecimal: a
decimal: 10
octal: 12
binary: 0000000000000000000000000000000000000000000000000000000000001010
bits set: 3 1
crash>

The per-cpu base addresses of cpu1 and cpu3 are ffff88043fc80000 and ffff88043fd80000 respectively, so the csd sent to cpu1 is at 0xffff88043fc9adb8:

crash> px (0xffff88043fc80000+0x1adb8)
$11 = 0xffff88043fc9adb8
crash>

Its call_single_data.flags is 0, meaning cpu1 has already responded to and handled the IPI:

crash> struct call_single_data 0xffff88043fc9adb8
struct call_single_data {
  {
    llist = {
      next = 0x0
    },
    __UNIQUE_ID_rh_kabi_hide4 = {
      list = {
        next = 0x0,
        prev = 0x0
      }
    },
    {<No data fields>}
  },
  func = 0xffffffff8106e4a0 <flush_tlb_func>,
  info = 0xffff88000399b830,
  flags = 0
}
crash>

The csd sent to cpu3 is at 0xffff88043fd9adb8:

crash> px (0xffff88043fd80000+0x1adb8)
$12 = 0xffff88043fd9adb8
crash>

Its call_single_data.flags is 1, meaning cpu3 has not yet responded to and handled the IPI:

crash> struct call_single_data 0xffff88043fd9adb8
struct call_single_data {
  {
    llist = {
      next = 0xffff88043fd979c0
    },
    __UNIQUE_ID_rh_kabi_hide4 = {
      list = {
        next = 0xffff88043fd979c0,
        prev = 0x0
      }
    },
    {<No data fields>}
  },
  func = 0xffffffff8106e4a0 <flush_tlb_func>,
  info = 0xffff88000399b830,
  flags = 1
}
crash>

There is in fact a quicker way to find which csd's flags cpu2 was waiting on at the time of the crash. The stack shows the faulting RIP is smp_call_function_many+514:

crash> bt
PID: 12263  TASK: ffff8803bbebdee0  CPU: 2  COMMAND: "kworker/u8:0"
...
[exception RIP: smp_call_function_many+514]
RIP: ffffffff810f99a2  RSP: ffff88000399b7e8  RFLAGS: 00000202
RAX: 0000000000000003  RBX: 00000000000000fc  RCX: ffff88043fd9adb8

The corresponding disassembly:

crash> dis smp_call_function_many
...
0xffffffff810f99a0 <smp_call_function_many+512>: pause
0xffffffff810f99a2 <smp_call_function_many+514>: testb $0x1,0x20(%rcx)
0xffffffff810f99a6 <smp_call_function_many+518>: jne 0xffffffff810f99a0 <smp_call_function_many+512>
...

call_single_data.flags is 1, so the AND of the two testb operands is nonzero, ZF (bit 6 of RFLAGS) stays 0, and the jne keeps jumping back to pause: the CPU spins forever between these two instructions. RFLAGS = 0x202 confirms ZF is clear:

crash> eval -b 0x202
hexadecimal: 202
decimal: 514
octal: 1002
binary: 0000000000000000000000000000000000000000000000000000001000000010
bits set: 9 1    (bit 6, the ZF flag, is 0)
crash>

The instructions from smp_call_function_many+512 to +518 correspond to the C source of csd_lock_wait:

static void csd_lock_wait(struct call_single_data *csd)
{
        while (csd->flags & CSD_FLAG_LOCK)
                cpu_relax();
}

The flags field sits at offset 0x20 inside call_single_data:

crash> struct call_single_data.flags -xo
struct call_single_data {
  [0x20] u16 flags;
}
crash>

So testb $0x1,0x20(%rcx) is exactly while (csd->flags & CSD_FLAG_LOCK), and the RCX value ffff88043fd9adb8 must be the address of the call_single_data csd. That address is cpu3's csd, which means cpu2 was waiting for cpu3 to respond to its IPI.

Let's first look at what a CPU does when it receives an IPI:

static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
        ...
        /* Walk this CPU's call_single_queue, i.e. handle every IPI
         * request that was sent to this CPU. */
        llist_for_each_entry_safe(csd, csd_next, entry, llist) {
                csd->func(csd->info);   /* IPI callback; here: flush_tlb_func */
                csd_unlock(csd);        /* clear the csd's flags when done */
        }
        ...
}
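
How does an incoming IPI get here? Roughly, the x86 CALL_FUNCTION vector handler ends up draining the queue via flush_smp_call_function_queue; a sketch of the call chain in kernels of this era (the exact wrapper names vary between versions):

/* Sketch: x86 entry for CALL_FUNCTION_VECTOR. */
void smp_call_function_interrupt(struct pt_regs *regs)
{
        /* APIC ack, irq_enter(), statistics ... */
        generic_smp_call_function_interrupt();
        /* irq_exit() ... */
}

/* generic_smp_call_function_interrupt is an alias of the _single variant,
 * since both vectors drain the same llist-based call_single_queue. */
void generic_smp_call_function_single_interrupt(void)
{
        flush_smp_call_function_queue(true);
}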

We already established that cpu2 sent IPI requests to cpu1 and cpu3, and both of those CPUs have cpu_tlbstate.state equal to 2:

crash> p cpu_tlbstate
PER-CPU DATA TYPE:
  struct tlb_state cpu_tlbstate;
PER-CPU ADDRESSES:
  [0]: ffff88043fc16500
  [1]: ffff88043fc96500
  [2]: ffff88043fd16500
  [3]: ffff88043fd96500
crash> struct tlb_state.state ffff88043fc96500
state = 2
crash> struct tlb_state.state ffff88043fd96500
state = 2
crash>

#define TLBSTATE_OK     1
#define TLBSTATE_LAZY   2

static void flush_tlb_func(void *info)
{
        struct flush_tlb_info *f = info;

        inc_irq_stat(irq_tlb_count);

        if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
                return;

        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
                if (f->flush_end == TLB_FLUSH_ALL)
                        local_flush_tlb();
                else if (!f->flush_end)
                        __flush_tlb_single(f->flush_start);
                else {
                        unsigned long addr;
                        addr = f->flush_start;
                        while (addr < f->flush_end) {
                                __flush_tlb_single(addr);
                                addr += PAGE_SIZE;
                        }
                }
        } else
                /* cpu_tlbstate.state is 2 (TLBSTATE_LAZY), so this branch runs */
                leave_mm(smp_processor_id());
}
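
For reference, __flush_tlb_single on x86 comes down to the invlpg instruction, which invalidates the TLB entry for a single page on the local CPU; a sketch of the native definition in this kernel generation (ignoring the paravirt indirection):

static inline void __native_flush_tlb_single(unsigned long addr)
{
        /* Invalidate the TLB entry covering addr on this CPU only. */
        asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
}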

When cpu_tlbstate.state is not TLBSTATE_OK, flush_tlb_func calls leave_mm(smp_processor_id()). Since cpu2 sent the IPIs to cpu1 and cpu3, smp_processor_id() there is 1 and 3, and their tlb_state.active_mm values are 0xffff88042ce99900 and 0xffff88042cfb0640 respectively:

crash> struct tlb_state.active_mm ffff88043fc96500
active_mm = 0xffff88042ce99900
crash> struct tlb_state.active_mm ffff88043fd96500
active_mm = 0xffff88042cfb0640
crash>

void leave_mm(int cpu)  
{               
        struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
                BUG();  
        if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
                cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
                load_cr3(swapper_pg_dir);
        }                       
}            

After handling the callback, the receiving CPU clears its own bit in cpu_vm_mask_var of cpu_tlbstate.active_mm (leave_mm also switches CR3 to swapper_pg_dir, the kernel page tables). At crash time, bit 3 of cpu3's cpu_vm_mask_var was still set:

crash> struct mm_struct.cpu_vm_mask_var 0xffff88042cfb0640
cpu_vm_mask_var = 0xffff88042cfb0988
crash> rd 0xffff88042cfb0988
ffff88042cfb0988:  0000000000000008    ........
crash>

/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
{
        return mm->cpu_vm_mask_var;
}

#define cpumask_bits(maskp) ((maskp)->bits)

/**
 * cpumask_clear_cpu - clear a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 */
static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
}

Now back to the call stack of cpu2, the CPU that triggered the reboot:

crash> bt
PID: 12263  TASK: ffff8803bbebdee0  CPU: 2  COMMAND: "kworker/u8:0"
....
[exception RIP: smp_call_function_many+514]
RIP: ffffffff810f99a2  RSP: ffff88000399b7e8  RFLAGS: 00000202
RAX: 0000000000000003  RBX: 00000000000000fc  RCX: ffff88043fd9adb8
RDX: 0000000000000003  RSI: 0000000000000004  RDI: 0000000000000000
RBP: ffff88000399b820   R8: ffff88017a1ee000   R9: ffffffff813227d9
R10: ffff88043fd19c80  R11: ffffea00000c2100  R12: 0000000000000292
R13: ffff88000399b798  R14: ffffea0010b1f842  R15: 0000000000000000
ORIG_RAX: ffffffffffffff10  CS: 0010  SS: 0018
#10 [ffff88000399b828] native_flush_tlb_others at ffffffff8106e668
#11 [ffff88000399b878] flush_tlb_page at ffffffff8106e864
#12 [ffff88000399b898] ptep_clear_flush at ffffffff811c2524
#13 [ffff88000399b8d0] page_mkclean at ffffffff811bbf2e
....

void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
{
        struct mm_struct *mm = vma->vm_mm;
    ...
        if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
                flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);             
        ...
}

static inline void flush_tlb_others(const struct cpumask *cpumask,
                                    struct mm_struct *mm,
                                    unsigned long start,
                                    unsigned long end)
{
        PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long end)
{
        ...
        smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}

Now find the address passed as smp_call_function_many's first argument, the cpumask pointer:

crash> whatis smp_call_function_many
void smp_call_function_many(const struct cpumask *, smp_call_func_t, void *, bool);
crash>

crash> dis -r ffffffff8106e668
...
0xffffffff8106e654 <native_flush_tlb_others+164>: mov $0x1,%ecx
0xffffffff8106e659 <native_flush_tlb_others+169>: mov $0xffffffff8106e4a0,%rsi
0xffffffff8106e660 <native_flush_tlb_others+176>: mov %r14,%rdi  // r14 and rdi hold arg 1 of smp_call_function_many
0xffffffff8106e663 <native_flush_tlb_others+179>: callq 0xffffffff810f97a0 <smp_call_function_many>
0xffffffff8106e668 <native_flush_tlb_others+184>: jmp 0xffffffff8106e62a <native_flush_tlb_others+122>

crash> dis smp_call_function_many
0xffffffff810f97a0 <smp_call_function_many>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff810f97a5 <smp_call_function_many+5>: push %rbp
0xffffffff810f97a6 <smp_call_function_many+6>: mov %rsp,%rbp
0xffffffff810f97a9 <smp_call_function_many+9>: push %r15
0xffffffff810f97ab <smp_call_function_many+11>: push %r14  // r14 is pushed here, still holding arg 1
0xffffffff810f97ad <smp_call_function_many+13>: mov %rdx,%r14
0xffffffff810f97b0 <smp_call_function_many+16>: push %r13
0xffffffff810f97b2 <smp_call_function_many+18>: mov %rsi,%r13
0xffffffff810f97b5 <smp_call_function_many+21>: push %r12
0xffffffff810f97b7 <smp_call_function_many+23>: push %rbx

crash> bt -f
....
[exception RIP: smp_call_function_many+514]
RIP: ffffffff810f99a2  RSP: ffff88000399b7e8  RFLAGS: 00000202
RAX: 0000000000000003  RBX: 00000000000000fc  RCX: ffff88043fd9adb8
RDX: 0000000000000003  RSI: 0000000000000004  RDI: 0000000000000000
RBP: ffff88000399b820   R8: ffff88017a1ee000   R9: ffffffff813227d9
R10: ffff88043fd19c80  R11: ffffea00000c2100  R12: 0000000000000292
R13: ffff88000399b798  R14: ffffea0010b1f842  R15: 0000000000000000
ORIG_RAX: ffffffffffffff10  CS: 0010  SS: 0018
    ffff88000399b740: ffffea0010b1f842 ffff88000399b798
    ffff88000399b750: 0000000000000292 ffff88000399b820
    ffff88000399b760: 00000000000000fc ffffea00000c2100
    ffff88000399b770: ffff88043fd19c80 ffffffff813227d9
    ffff88000399b780: ffff88017a1ee000 0000000000000003
    ffff88000399b790: ffff88043fd9adb8 0000000000000003
    ffff88000399b7a0: 0000000000000004 0000000000000000
    ffff88000399b7b0: ffffffffffffff10 ffffffff810f99a2
    ffff88000399b7c0: 0000000000000010 0000000000000202
    ffff88000399b7d0: ffff88000399b7e8 0000000000000018
    ffff88000399b7e0: ffffffff810f9977 000000012c7e1370
    ffff88000399b7f0: ffff88042cfb0988 ffff88042cfb0640
    ffff88000399b800: 00007f4474b04000 0000000000000000
    ffff88000399b810: ffff88042cfb0988 ffff880114338820
    ffff88000399b820: ffff88000399b870 ffffffff8106e668
#10 [ffff88000399b828] native_flush_tlb_others at ffffffff8106e668
....

The stack contents give us arg 1: in the prologue above, r15 is pushed and then r14 (still holding arg 1), so the slot at ffff88000399b810 holds the cpumask pointer, 0xffff88042cfb0988:

crash> struct cpumask 0xffff88042cfb0988
struct cpumask {
  bits = {8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash>

This cpumask, the first argument cpu2 passed to smp_call_function_many, is in fact the same address as the cpu_vm_mask_var whose cpu3 bit, as analyzed above, cpu3 is supposed to clear once it has handled the IPI:

crash> struct mm_struct.cpu_vm_mask_var 0xffff88042cfb0640
cpu_vm_mask_var = 0xffff88042cfb0988
crash>

So in practice, to know which CPU(s) cpu2 was still waiting on, all we need is the cpumask that cpu2 passed as argument 1 to smp_call_function_many.

The memory behind that first argument records which CPUs have not yet responded to, or are still processing, the IPI callback, i.e. the CPUs that the watchdog-triggering CPU is waiting for. cfd_data:x's cpumask is derived from that same argument, but a CPU that completes an IPI request clears its own bit behind arg 1, not in cfd->cpumask. Therefore cfd_data:x's cpumask records which CPUs this CPU sent IPIs to before the crash, while the value behind arg 1 records which CPUs it is still waiting on. For this vmcore:

cpu2 sent IPI requests to cpu1 and cpu3:

crash> p cfd_data:2
per_cpu(cfd_data, 2) = $15 = {
  csd = 0x1adb8,
  cpumask = 0xffff88017a1ee000
}
crash> struct cpumask 0xffff88017a1ee000
struct cpumask {
  bits = {10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash>

cpu2 is waiting for cpu3 to complete its IPI request; cpu1 has already completed it, so bit 1 has been cleared:

crash> struct cpumask 0xffff88042cfb0988
struct cpumask {
  bits = {8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash>

Now let's see how many unhandled IPI requests cpu3 has piled up:

crash> p call_single_queue:3
per_cpu(call_single_queue, 3) = $19 = {
  first = 0xffff88043fd9adb8
}
crash> list 0xffff88043fd9adb8
ffff88043fd9adb8    (the request sent by cpu2)
ffff88043fd979c0    (another pending request)
crash>
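
Note that list here walks the csd->llist.next pointers, and because llist is the first member of call_single_data, each listed address is also the address of a csd itself. A simplified sketch of the layout (the RHEL kernel wraps the first field in a 16-byte kabi union with a list_head, which is why flags lands at offset 0x20 in the dumps above rather than 0x18):

struct call_single_data {
        struct llist_node llist;        /* offset 0: node in call_single_queue */
        smp_call_func_t func;           /* IPI callback */
        void *info;                     /* callback argument */
        u16 flags;                      /* CSD_FLAG_LOCK etc. */
};

/* llist_entry() is container_of(); with llist at offset 0 it returns
 * the node pointer unchanged, so node address == csd address. */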

Why is cpu3 not handling its IPI requests? At the time of the crash it was running the idle swapper task:

crash> bt -c 3
PID: 0  TASK: ffff88017a203f40  CPU: 3  COMMAND: "swapper/3"
bt: page excluded: kernel virtual address: ffffffffffffffff type: "cpu_online_map"
 #0 [ffff88043fd85e48] crash_nmi_callback at ffffffff8104fd61
 #1 [ffff88043fd85e58] nmi_handle at ffffffff816ad427
 #2 [ffff88043fd85eb0] do_nmi at ffffffff816ad65d
 #3 [ffff88043fd85ef0] end_repeat_nmi at ffffffff816ac8d3
    [exception RIP: native_safe_halt+6]
    RIP: ffffffff816ab4a6  RSP: ffff88017a23bea8  RFLAGS: 00000286
    RAX: 00000000ffffffed  RBX: ffffffff81b1c820  RCX: 0100000000000000
    RDX: 0000000000000000  RSI: 0000000000000000  RDI: 0000000000000046
    RBP: ffff88017a23bea8   R8: 0000000000000000   R9: 0000000000000000
    R10: 0000000000000000  R11: 0004ef1ee033ca80  R12: 0000000000000003
    R13: ffff88017a238000  R14: ffff88017a238000  R15: ffff88017a238000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
 #4 [ffff88017a23bea8] native_safe_halt at ffffffff816ab4a6
 #5 [ffff88017a23beb0] default_idle at ffffffff816ab33e
 #6 [ffff88017a23bed0] arch_cpu_idle at ffffffff81035006
 #7 [ffff88017a23bee0] cpu_startup_entry at ffffffff810e7bca
 #8 [ffff88017a23bf28] start_secondary at ffffffff81051af6
crash>

Yet cpu3's run queue does have tasks waiting for CPU time:

crash> runq
...
CPU 3 RUNQUEUE: ffff88043fd96cc0
  CURRENT: PID: 0  TASK: ffff88017a203f40  COMMAND: "swapper/3"
  RT PRIO_ARRAY: ffff88043fd96e50
     [no tasks queued]
  CFS RB_ROOT: ffff88043fd96d68
     [120] PID: 30118  TASK: ffff880012ab8000  COMMAND: "barad_agent"
     [120] PID: 30121  TASK: ffff8800368dbf40  COMMAND: "java"
crash>

cpu3 was not deadlocked, yet it neither responded to the IPI nor scheduled the runnable tasks in its queue. Since this machine is a KVM guest, we suspected an anomaly on the host was starving the guest's vCPU 3 of physical CPU time. Checking the host logs confirmed it: a disk fault had driven the qemu process backing this guest into the D (uninterruptible sleep) state.
