crash> bt
PID: 31918 TASK: ffff8820117a8240 CPU: 10 COMMAND: "test_gifconf"
#0 [ffff88202e597ae0] machine_kexec at ffffffff81020ad2
#1 [ffff88202e597b30] crash_kexec at ffffffff81088920
#2 [ffff88202e597c00] oops_end at ffffffff8139f8c0
#3 [ffff88202e597c20] __bad_area_nosemaphore at ffffffff8102ed15
#4 [ffff88202e597ce0] page_fault at ffffffff8139eb3f
[exception RIP: inet_gifconf+139]
RIP: ffffffff813512cb RSP: ffff88202e597d98 RFLAGS: 00010246
RAX: ffff88202e597d98 RBX: ffff88202e597d98 RCX: 0000000000000000
RDX: 0000000000000003 RSI: ffff8820015ad173 RDI: ffff88202e597d98
RBP: ffff8820015ad140 R8: 00007fac1cf40300 R9: 00007fac1cf534c0
R10: 0000000000000000 R11: ffffffff81349980 R12: 0000000000000028
R13: 00007fff8b2feeb0 R14: 00000000000000c8 R15: ffff88202e597da8
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#5 [ffff88202e597e00] dev_ioctl at ffffffff812fc1a2
#6 [ffff88202e597ec0] sock_ioctl at ffffffff812e654d
#7 [ffff88202e597ef0] vfs_ioctl at ffffffff8110f06f
#8 [ffff88202e597f10] do_vfs_ioctl at ffffffff8110f4fb
#9 [ffff88202e597f40] sys_ioctl at ffffffff8110f611
#10 [ffff88202e597f80] system_call_fastpath at ffffffff81002f7b
RIP: 00007fac1ccb3017 RSP: 00007fff8b2feea0 RFLAGS: 00000217
RAX: 0000000000000010 RBX: ffffffff81002f7b RCX: 00007fac1ccb3017
RDX: 00007fff8b2ff0b0 RSI: 0000000000008912 RDI: 0000000000000003
RBP: 0000000000000000 R8: 00007fac1cf40300 R9: 00007fac1cf534c0
R10: 0000000000000000 R11: 0000000000000246 R12: 00007fff8b2ff1a0
R13: 0000000000400530 R14: 00007fff8b2ff0d0 R15: 00000000004006f0
ORIG_RAX: 0000000000000010 CS: 0033 SS: 002b
crash>
/*
 * Excerpt of inet_gifconf() as quoted in this write-up: only the statements
 * that fill in ifr.ifr_name are shown. The declarations of ifa and ifr and the
 * rest of the function body are not part of this excerpt.
 */
static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
{
/*
 * Use the address label as the interface name when ifa->ifa_label tests true.
 * NOTE(review): the struct in_ifaddr layout dumped later in this document shows
 * ifa_label as char[16]; if so, this test is on an array and cannot select the
 * else branch - confirm against the full kernel source.
 */
if (ifa->ifa_label)
strcpy(ifr.ifr_name, ifa->ifa_label);
else
/* Otherwise fall back to the device name. */
strcpy(ifr.ifr_name, dev->name);
}
crash> dis inet_gifconf
0xffffffff81351290 <inet_gifconf+0x50>: lea 0x33(%rbp),%rsi
0xffffffff81351294 <inet_gifconf+0x54>: mov %rsp,%rdi
0xffffffff81351297 <inet_gifconf+0x57>: movq $0x0,(%rbx)
0xffffffff8135129e <inet_gifconf+0x5e>: movq $0x0,0x8(%rbx)
0xffffffff813512a6 <inet_gifconf+0x66>: movq $0x0,0x10(%rbx)
0xffffffff813512ae <inet_gifconf+0x6e>: movq $0x0,0x18(%rbx)
0xffffffff813512b6 <inet_gifconf+0x76>: movq $0x0,0x20(%rbx)
0xffffffff813512be <inet_gifconf+0x7e>: callq 0xffffffff811e74a0 <strcpy>
0xffffffff813512c3 <inet_gifconf+0x83>: movw $0x2,(%r15) // 之所以认定RIP本应是这条指令的地址,是根据后面的分析反推出来的
0xffffffff813512c9 <inet_gifconf+0x89>: mov 0x20(%rbp),%eax
0xffffffff813512cc <inet_gifconf+0x8c>: mov $0x28,%edx
0xffffffff813512d1 <inet_gifconf+0x91>: mov %rsp,%rsi
0xffffffff813512d4 <inet_gifconf+0x94>: mov %r13,%rdi
0xffffffff813512d7 <inet_gifconf+0x97>: mov %eax,0x4(%r15)
crash> struct in_ifaddr -o
struct in_ifaddr {
[0x0] struct in_ifaddr *ifa_next;
[0x8] struct in_device *ifa_dev;
[0x10] struct rcu_head rcu_head;
[0x20] __be32 ifa_local;
[0x24] __be32 ifa_address;
[0x28] __be32 ifa_mask;
[0x2c] __be32 ifa_broadcast;
[0x30] unsigned char ifa_scope;
[0x31] unsigned char ifa_flags;
[0x32] unsigned char ifa_prefixlen;
[0x33] char ifa_label[16];
}
SIZE: 0x48
crash>
RBP: ffff8820015ad140 对应struct in_ifaddr *ifa;
RSI: ffff8820015ad173 对应ifa-> ifa_label
crash> struct in_ifaddr ffff8820015ad140
struct in_ifaddr {
ifa_next = 0xffff882003da3840,
ifa_dev = 0xffff88103757ce00,
rcu_head = {
next = 0x0,
func = 0
},
ifa_local = 0x100007f,
ifa_address = 0x100007f,
ifa_mask = 0xff,
ifa_broadcast = 0xffffff7f,
ifa_scope = 0xfe,
ifa_flags = 0x0,
ifa_prefixlen = 0x8,
ifa_label = "lo\000\000\000\000\000\000\000\000\000\000\000\000\000"
}
crash> eval ffff8820015ad173-ffff8820015ad140
hexadecimal: 33
decimal: 51
octal: 63
binary: 0000000000000000000000000000000000000000000000000000000000110011
crash>
ifr.ifr_name 对应到RDI: ffff88202e597d98
crash> rd ffff88202e597d98
ffff88202e597d98: 0000000000006f6c lo......
crash> ascii 0000000000006f6c
0000000000006f6c: lo<NUL><NUL><NUL><NUL><NUL><NUL>
crash>
从上面信息可以确定0xffffffff813512be <inet_gifconf+0x7e>: callq 0xffffffff811e74a0 <strcpy>这条指令已经执行完成
通过R15所指向的内存中的值可以确定0xffffffff813512c3 <inet_gifconf+0x83>: movw $0x2,(%r15)这条指令还没有被执行:
R15: ffff88202e597da8
crash> rd ffff88202e597da8
ffff88202e597da8: 000000000000458b .E......
crash>
因为执行0xffffffff813512be <inet_gifconf+0x7e>: callq 0xffffffff811e74a0 <strcpy>时需要把下一条指令的地址(即返回地址)压栈,callq 0xffffffff811e74a0 <strcpy>的下一条指令为:
0xffffffff813512c3 <inet_gifconf+0x83>: movw $0x2,(%r15)
因为x86的栈空间是从高地址往低地址延伸的,栈指针rsp从栈空间的最高地址向最低地址移动,而thread_info存放在栈空间的最低地址处,所以通过thread_info的地址ffff88202e596000可以从栈空间的最低地址往上查看整个栈信息:
# crash vmlinux-2.6.32.59-0.7-default.debug vmlinux-2.6.32.59-0.7-default.gz vmcore_for_panic_on_inet_gitconf
COMMAND: "test_gifconf"
TASK: ffff8820117a8240 [THREAD_INFO: ffff88202e596000]
CPU: 10
STATE: TASK_RUNNING (PANIC)
task_struct数据结构中的stack成员指向thread_union结构(Linux内核通过thread_union联合体来表示进程的内核栈)
crash> thread_union
union thread_union {
struct thread_info thread_info;
long unsigned int stack[1024];
}
SIZE: 0x2000
crash>
因此,通过task_struct的stack成员可以找到thread_info的地址为0xffff88202e596000
crash> struct task_struct ffff8820117a8240 | grep stack
stack = 0xffff88202e596000,
crash> struct thread_info ffff88202e596000 | grep task
task = 0xffff8820117a8240,
crash>
crash> thread_union 0xffff88202e596000 | grep -w thread_info -A1
thread_info = {
task = 0xffff8820117a8240,
crash>
crash> x/1024xg 0xffff88202e596000
0xffff88202e596000: 0xffff8820117a8240 0xffffffff81814200
0xffff88202e596010: 0x0000001000000000 0x000000000000000a
……………………………
…………………………..
0xffff88202e597d80: 0xffff88202e597d98 0x0000000000000018
0xffff88202e597d90: 0xffffffff813512c3 0x0000000000006f6c // ffff88202e597d98
0xffff88202e597da0: 0x0000000000000000 0x000000000000458b
通过栈信息可以知道栈里面函数的返回地址0xffffffff813512c3没有被破坏
因为当前栈指针寄存器rsp的值为RSP:ffff88202e597d98,并且栈是从高地址往低地址延伸的,因此可以知道代码刚从strcpy返回并且把函数返回地址从栈里取出放置到RIP中,所以正常情况下RIP的值应该是
0xffffffff813512c3,但是当前RIP: 0010:[<ffffffff813512cb>]。在调用strcpy前执行了一条0xffffffff81351294 <inet_gifconf+0x54>: mov %rsp,%rdi指令,我们从触发vmcore时rdi的值为RDI: ffff88202e597d98也可以知道
RSP在调用strcpy前就是ffff88202e597d98,进入strcpy后因为系统在调用函数时自动做了一次压栈动作,所以进入strcpy时RSP指向地址0xffff88202e597d90: 0xffffffff813512c3。
所以下一条本来要执行的指令应该是0xffffffff813512c3 <inet_gifconf+0x83>: movw $0x2,(%r15),但是函数返回时RIP装载的却是ffffffff813512cb。系统因为该问题重启了多次,通过vtop对比多个vmcore可以看到,每次出现问题时对应的物理地址都不相同,因此排除是内存问题。retq是cpu指令,因此推测是cpu异常导致的问题。虽然cpu异常概率很小,但是只要信息充分就大胆相信自己的判断吧。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。