专栏首页随笔+systemtap实现定位内核丢包工具
原创

systemtap实现定位内核丢包工具

内核丢弃数据包的点非常多,如何快速定位是哪个地方丢包了呢?工欲善其事必先利其器。

基于dropwatch2.stp工具改造了一个工具来揪出丢包位置。dropwatch2.stp的实现利用了内核的kfree_skb trace探测点:

内核通过kfree_skb释放skb,kfree_skb函数中已经埋下了trace点,并且通过__builtin_return_address(0)记录下了调用kfree_skb的函数地址并传给location参数:

/**
 *      kfree_skb - free an sk_buff
 *      @skb: buffer to free
 *
 *      Drop a reference to the buffer and free it if the usage count has
 *      hit zero.
 *
 *      A NULL @skb is accepted and ignored. When the buffer is actually
 *      freed, the kfree_skb tracepoint is fired first with the return
 *      address of this call, i.e. the location of the caller that
 *      dropped the packet.
 */
void kfree_skb(struct sk_buff *skb)
{
        if (unlikely(!skb))
                return;
        /* Sole reference left: no decrement is done, only a read barrier. */
        if (likely(atomic_read(&skb->users) == 1))
                smp_rmb();
        /* Otherwise drop one reference; return if the count did not reach zero. */
        else if (likely(!atomic_dec_and_test(&skb->users)))
                return;
        /* Record the caller's address as the drop location before freeing. */
        trace_kfree_skb(skb, __builtin_return_address(0));
        __kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);


/*
 * Tracepoint for free an sk_buff:
 *
 * Takes the skb being freed and the code location passed by the caller
 * of the tracepoint. Each event records the skb address, that location,
 * and skb->protocol converted with ntohs().
 */
TRACE_EVENT(kfree_skb,

        TP_PROTO(struct sk_buff *skb, void *location),

        TP_ARGS(skb, location),

        /* Fields stored in each trace record. */
        TP_STRUCT__entry(
                __field(        void *,         skbaddr         )
                __field(        void *,         location        )
                __field(        unsigned short, protocol        )
        ),

        /* Copy the tracepoint arguments into the record. */
        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
                __entry->protocol = ntohs(skb->protocol);
        ),

        TP_printk("skbaddr=%p protocol=%u location=%p",
                __entry->skbaddr, __entry->protocol, __entry->location)
);

因此可以利用systemtap kernel.trace来跟踪kfree_skb.

#!/usr/bin/stap --all-modules
%{
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/textsearch.h>
#include <net/checksum.h>
#include <linux/dma-mapping.h>
#include <linux/netdev_features.h>
#include <linux/skbuff.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/udp.h>
#include <uapi/linux/tcp.h>
%}
############################################################
# trace_net_drop.stp
# An example script to mimic the behavior of the dropwatch utility
# Need install kernel-debuginfo and kernel-debuginfo-common before running this script
# Default reports every 5 seconds with timestamp
# Usage example: //-g for guru mode
#stap -g -v --all-modules  trace_net_drop.stp tcp daddr=10.0.0.14 dport=22 //dump all stack when skb dest addr/port 10.0.0.14/22
#stap -g -v --all-modules  trace_net_drop.stp tcp saddr=1.1.1.1 sport=1000 daddr=2.2.2.2 dport=22 match=nf_hook_slow //just dump stack when kfree_skb is called by function "nf_hook_slow"
#stap -g -v --all-modules  trace_net_drop.stp tcp daddr=10.0.0.14 dport=22 filter=tcp_rcv_state_process //count every drop except those where kfree_skb is called from "tcp_rcv_state_process"
#stap -g -v --all-modules  -B CONFIG_MODVERSIONS=y trace_net_drop.stp tcp daddr=10.0.0.14 dport=22 filter=tcp_rcv_state_process //add '-B CONFIG_MODVERSIONS=y' if kernel config enable Module versioning support
############################################################
/*
 * get_skb_saddr - IPv4 source address of an skb as a dotted-quad string.
 * @skb: address of a struct sk_buff, passed from the probe as a long.
 *
 * Returns "0.0.0.0" when @skb is NULL or when the network header does not
 * carry IP version 4, so non-IP packets never produce a garbage address.
 */
function get_skb_saddr:string(skb:long)
%{
     struct sk_buff *skb = (struct sk_buff *)STAP_ARG_skb;
     struct iphdr *ip_header;
     unsigned int src_ip = 0;
     const unsigned char *octet;

     if (!skb)
         goto EXIT_F;

     ip_header = (struct iphdr *)skb_network_header(skb);
     /* Only trust saddr when the header really is IPv4. */
     if (!ip_header || ip_header->version != 4)
         goto EXIT_F;

     src_ip = (unsigned int)ip_header->saddr;
EXIT_F:
     /* saddr is kept in network byte order, so print it byte by byte. */
     octet = (const unsigned char *)&src_ip;
     snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%u.%u.%u.%u",
              (unsigned int)octet[0], (unsigned int)octet[1],
              (unsigned int)octet[2], (unsigned int)octet[3]);
%}



/*
 * get_skb_daddr - IPv4 destination address of an skb as a dotted-quad string.
 * @skb: address of a struct sk_buff, passed from the probe as a long.
 *
 * Returns "0.0.0.0" when @skb is NULL or when the network header does not
 * carry IP version 4, so non-IP packets never produce a garbage address.
 */
function get_skb_daddr:string(skb:long)
%{
     struct sk_buff *skb = (struct sk_buff *)STAP_ARG_skb;
     struct iphdr *ip_header;
     unsigned int dst_ip = 0;
     const unsigned char *octet;

     if (!skb)
         goto EXIT_F;

     ip_header = (struct iphdr *)skb_network_header(skb);
     /* Only trust daddr when the header really is IPv4. */
     if (!ip_header || ip_header->version != 4)
         goto EXIT_F;

     dst_ip = (unsigned int)ip_header->daddr;
EXIT_F:
     /* daddr is kept in network byte order, so print it byte by byte. */
     octet = (const unsigned char *)&dst_ip;
     snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%u.%u.%u.%u",
              (unsigned int)octet[0], (unsigned int)octet[1],
              (unsigned int)octet[2], (unsigned int)octet[3]);
%}

/*
 * get_skb_sport - TCP/UDP source port of an skb as a decimal string.
 * @skb: address of a struct sk_buff, passed from the probe as a long.
 *
 * Returns "0" when @skb is NULL, when the network header is not IPv4, or
 * when the IP protocol is neither TCP (6) nor UDP (17).
 * NOTE(review): the port is read through skb_transport_header(); this
 * assumes the transport header offset has already been set for the skb.
 */
function get_skb_sport:string(skb:long)
%{
     struct sk_buff *skb = (struct sk_buff *)STAP_ARG_skb;
     struct iphdr *ip_header;
     struct udphdr *udp_header;
     struct tcphdr *tcp_header;
     unsigned int src_port = 0;

     if (!skb)
         goto EXIT_F;

     ip_header = (struct iphdr *)skb_network_header(skb);
     /* The protocol field is only meaningful for a real IPv4 header. */
     if (!ip_header || ip_header->version != 4)
         goto EXIT_F;

     if (ip_header->protocol == 17) {            /* UDP */
         udp_header = (struct udphdr *)skb_transport_header(skb);
         src_port = (unsigned int)ntohs(udp_header->source);
     } else if (ip_header->protocol == 6) {      /* TCP */
         tcp_header = (struct tcphdr *)skb_transport_header(skb);
         src_port = (unsigned int)ntohs(tcp_header->source);
     }
EXIT_F:
     snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%u", src_port);
%}


/*
 * get_skb_dport - TCP/UDP destination port of an skb as a decimal string.
 * @skb: address of a struct sk_buff, passed from the probe as a long.
 *
 * Returns "0" when @skb is NULL, when the network header is not IPv4, or
 * when the IP protocol is neither TCP (6) nor UDP (17).
 * NOTE(review): the port is read through skb_transport_header(); this
 * assumes the transport header offset has already been set for the skb.
 */
function get_skb_dport:string(skb:long)
%{
     struct sk_buff *skb = (struct sk_buff *)STAP_ARG_skb;
     struct iphdr *ip_header;
     struct udphdr *udp_header;
     struct tcphdr *tcp_header;
     unsigned int dst_port = 0;

     if (!skb)
         goto EXIT_F;

     ip_header = (struct iphdr *)skb_network_header(skb);
     /* The protocol field is only meaningful for a real IPv4 header. */
     if (!ip_header || ip_header->version != 4)
         goto EXIT_F;

     if (ip_header->protocol == 17) {            /* UDP */
         udp_header = (struct udphdr *)skb_transport_header(skb);
         dst_port = (unsigned int)ntohs(udp_header->dest);
     } else if (ip_header->protocol == 6) {      /* TCP */
         tcp_header = (struct tcphdr *)skb_transport_header(skb);
         dst_port = (unsigned int)ntohs(tcp_header->dest);
     }
EXIT_F:
     snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%u", dst_port);
%}

/*
 * get_skb_ipproto - name of the IPv4 payload protocol carried by an skb.
 * @skb: address of a struct sk_buff, passed from the probe as a long.
 *
 * Returns "TCP", "UDP" or "ICMP" for IP protocol numbers 6, 17 and 1.
 * Returns "NONE" for any other protocol, for a NULL @skb, and for packets
 * whose network header is not IPv4 (so such packets are never classified
 * from an unrelated header byte).
 */
function get_skb_ipproto:string(skb:long)
%{
     const char *ipproto = "NONE";
     struct sk_buff *skb = (struct sk_buff *)STAP_ARG_skb;
     struct iphdr *ip_header;

     if (!skb)
         goto EXIT_F;

     ip_header = (struct iphdr *)skb_network_header(skb);
     /* The protocol field is only meaningful for a real IPv4 header. */
     if (!ip_header || ip_header->version != 4)
         goto EXIT_F;

     switch (ip_header->protocol) {
     case 6:
         ipproto = "TCP";
         break;
     case 17:
         ipproto = "UDP";
         break;
     case 1:                                     /* IPPROTO_ICMP */
         ipproto = "ICMP";
         break;
     default:
         break;
     }

EXIT_F:
     snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%s", ipproto);
%}

# Packet filters, overridden from the command line by process_cmdline().
# The value "all" means "do not filter on this field".
global addr = "all"   # compared against both the source and the destination address
global port = "all"   # compared against both the source and the destination port
global saddr = "all"  # source address only
global sport = "all"  # source port only
global daddr = "all"  # destination address only
global dport = "all"  # destination port only
# Caller-function selection, compared with symname($location) in the kfree_skb probe.
global match = "all"      # "all" disables the match
global filter = "none"    # "none" disables the filter
# Protocol to watch: "ALL", "TCP", "UDP" or "ICMP".
global WatchIpproto = "ALL"
# Drop counters (statistical aggregates), reset after every report.
global kfree_skb_stack      # indexed by backtrace(); used when BackTrace is set
global kfree_skb_location   # indexed by $location; used otherwise
global interval=5 # default interval between output
global timeout=0          # one-second ticks elapsed since the last report
global BackTrace = 0      # set to 1 by the "bt" command line argument
/*function get_param_val:string (mystr:string) %{
     char *ptr;
     int  ch = '=';
     char *strargs = STAP_ARG_mystr;
     ptr=strchr(strargs , ch);
     snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%s",ptr + 1);
%}*/


# Print `msg` followed by the command line help, then request session shutdown.
# Note that exit() only queues the shutdown; the calling probe handler keeps
# running until it returns.
function usage (msg:string)
{
   printf("%s:\n\n",msg);
   printf("\tall|tcp|udp|icmp: trace proto\n")
   printf("\taddr=ip address:ip address\n")
   printf("\tsaddr=ip source address:ip source address\n")
   printf("\tdaddr=ip dest address:ip destination address\n")
   printf("\tport=port: ip port\n")
   printf("\tsport=source port:source port\n")
   printf("\tdport=dest port:dest port\n")
   printf("\tmatch=<all|kernel function name>:match a specific function or any function\n")
   # The "no filter" value is "none" (the script default), not "all".
   printf("\tfilter=<none|kernel function name>:filter out a specific function or nothing\n")
   printf("\tbt: print call trace\n")
   printf("\tinterval=second:Dump trace every 'interval' second\n")
   printf("\texample:\n")
   printf("\ttrace_net_drop.stp tcp saddr=1.1.1.1 sport=5000 daddr=2.2.2.2 dport=80\n\n");
   exit();
}


# Print a two-row table: column titles, then the currently active filter
# settings. `msg` is accepted for interface compatibility and is not used.
function print_header (msg:string)
{
    # Row 1: column titles, emitted in three chunks that form a single line.
    printf("%-20s  %-20s  %-20s  ", "IP PROTO", "SRC or DST ADDRESS", "SRC or DST PORT")
    printf("%-20s  %-20s  %-20s  %-20s  ", "Source Address", "Dest Address", "Source Port", "Dest Port")
    printf("%-20s  %-20s  %-20s\n", "match function", "filter function", "INTERVAL(sec)")

    # Row 2: the values taken from the script globals, same column layout.
    printf("%-20s  %-20s  %-20s  ", WatchIpproto, addr, port)
    printf("%-20s  %-20s  %-20s  %-20s  ", saddr, daddr, sport, dport)
    printf("%-20s  %-20s  %-20d\n", match, filter, interval)
}

# Parse the script arguments (argv[1..argc]) into the filter globals and
# print the resulting settings with print_header().
#
# Each argument is either a bare keyword (help, all, tcp, udp, icmp, bt) or
# a key=value pair. On "help", on an unknown keyword, or on a non-positive
# interval the usage text is printed and parsing stops immediately; the
# header is not printed in that case. Always returns 0.
function process_cmdline:long ()
{
    for (i = 1; i <= argc; i++) {
        # The first token is the keyword. A following tokenize("", "=")
        # continues on the same string and yields the text after '=',
        # so argv[] itself is left untouched.
        argument = tokenize(argv[i], "=")

        if (argument == "help") {
            usage("Usage")
            # usage() only queues the shutdown, so stop parsing explicitly.
            return 0
        }
        else if (argument == "all")
            WatchIpproto = "ALL"
        else if (argument == "tcp")
            WatchIpproto = "TCP"
        else if (argument == "udp")
            WatchIpproto = "UDP"
        else if (argument == "icmp")
            WatchIpproto = "ICMP"
        else if (argument == "addr")
            addr = tokenize("", "=")
        else if (argument == "port")
            port = tokenize("", "=")
        else if (argument == "saddr")
            saddr = tokenize("", "=")
        else if (argument == "daddr")
            daddr = tokenize("", "=")
        else if (argument == "sport")
            sport = tokenize("", "=")
        else if (argument == "dport")
            dport = tokenize("", "=")
        else if (argument == "match")
            match = tokenize("", "=")
        else if (argument == "filter")
            filter = tokenize("", "=")
        else if (argument == "interval") {
            interval = strtol(tokenize("", "="), 10)
            # A missing or non-numeric value parses as 0, which would make
            # the periodic report never fire.
            if (interval <= 0) {
                usage("interval must be a positive number of seconds")
                return 0
            }
        }
        else if (argument == "bt")
            BackTrace = 1
        else {
            usage("process cmdline fail")
            return 0
        }
    }

    print_header("")
    return 0
}

# Session start: validate the first argument, parse the command line and
# announce that monitoring has started.
probe begin
{
    # argv[1] is used instead of @1 so the script still compiles when it is
    # started without arguments; the first argument then reads as "" and the
    # usage text is shown.
    first_arg = argv[1]

    if (first_arg == "help") {
        usage("Usage")
        # exit() inside usage() does not unwind this handler, so leave
        # explicitly instead of announcing that monitoring has started.
        next
    }

    if (first_arg != "all" && first_arg != "tcp" && first_arg != "udp" && first_arg != "icmp") {
        usage("error cmdline")
        next
    }

    process_cmdline()
    printf("Monitoring for dropped packets\n")
}
# Session end: tell the user that monitoring has stopped.
probe end
{
    printf("Stopping dropped packet monitor\n")
}
# Increment a drop counter for every location we drop at.
#
# Fires on the kfree_skb tracepoint. A packet is counted only if it passes
# every filter that was set on the command line: protocol, addresses, ports,
# and the match/filter selection on the function that called kfree_skb.
# The counter is keyed by the full backtrace when "bt" was given, otherwise
# by the caller address ($location).
probe kernel.trace("kfree_skb") {

   if (WatchIpproto != "ALL" && get_skb_ipproto($skb) != WatchIpproto)
      next

   # addr / port match when either the source or the destination side matches.
   if (addr != "all" && addr != get_skb_saddr($skb) && addr != get_skb_daddr($skb))
      next
   if (port != "all" && port != get_skb_sport($skb) && port != get_skb_dport($skb))
      next

   if (saddr != "all" && saddr != get_skb_saddr($skb))
      next
   if (daddr != "all" && daddr != get_skb_daddr($skb))
      next
   if (sport != "all" && sport != get_skb_sport($skb))
      next
   if (dport != "all" && dport != get_skb_dport($skb))
      next

   # Resolve the caller name once, and only when it is actually needed.
   if (match != "all" || filter != "none")
   {
      caller = symname($location)
      # match: keep only drops issued by the named function.
      if (match != "all" && caller != match)
         next
      # filter: discard drops issued by the named function. Both options
      # apply together, so giving "filter" no longer overrides "match".
      if (filter != "none" && caller == filter)
         next
   }

   if (BackTrace)
      kfree_skb_stack[backtrace()] <<< 1
   else
      kfree_skb_location[$location] <<< 1
}
# Report the drop locations every `interval` seconds (5 by default).
#
# The probe fires once per second and counts ticks in `timeout`, so the
# reporting period can come from the command line. After each report the
# counters are cleared, so every report covers one interval only.
probe timer.sec(1)
{
   # "<" rather than "!=": a zero or negative interval then reports on
   # every tick instead of never reporting at all.
   if (++timeout < interval)
     next

   timeout = 0
   printf("\n====== %s ======\n", ctime(gettimeofday_s()))

   if (BackTrace)
   {
      # The trailing "-" makes foreach iterate in descending order.
      foreach (bt in kfree_skb_stack-) {
         printf("%d packets dropped at stack:\n", @count(kfree_skb_stack[bt]))
         print_syms(bt)
      }
      delete kfree_skb_stack
   }
   else
   {
      foreach (lt in kfree_skb_location-) {
         printf("%d packets dropped at %s\n", @count(kfree_skb_location[lt]), symname(lt))
      }
      delete kfree_skb_location
   }
}
### trace_net_drop.stp ends ###

     

执行方法示例:

stap -g -v --all-modules trace_net_drop.stp tcp daddr=10.0.0.14 dport=22

注:加上-g是使能了systemtap guru mode,在部分发行商内核版本上运行systemtap脚本如果加上-g选项会报"Invalid module format"的错误信息并且无法正常运行脚本,可通过如下方式解决:

(1)stap运行时加上-B CONFIG_MODVERSIONS=y选项 stap -g -v --all-modules -B CONFIG_MODVERSIONS=y ./trace_net_drop.stp tcp

或者 (2)make menuconfig—>Enable loadable module support —>[*] Module versioning support,把“Module versioning support”前面的星号去掉,禁止版本检测选项即可。

原创声明,本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

登录 后参与评论
0 条评论

相关文章

  • ​内核调试技巧--systemtap定位丢包原因

    作者:wqiangwang,腾讯 TEG 后台开发工程师 内核收发包,可能会由于backlog队列满、内存不足、包校验失败、特性开关如rpf、路由不可达、端口未...

    腾讯技术工程官方号
  • 程序员精进之路:性能调优利器--火焰图

    作者:厉辉,腾讯 CSIG 后台开发工程师 本文主要分享火焰图使用技巧,介绍 systemtap 的原理机制,如何使用火焰图快速定位性能问题原因,同时加深对 s...

    腾讯技术工程官方号
  • 动态跟踪分析Nginx-工具介绍篇

    之前写过一篇文章Nginx调试必备,介绍了几种调试Nginx的工具,包括echo、lua、njs,这些工具,都只是方便输出或者打印日志输出一些变量等,方便运维人...

    李俊鹏
  • Linux下systemtap和火焰图介绍及安装

    SystemTap 是对 Linux 内核监控和跟踪的工具,详细的介绍及说明见官网。

    用户8705059
  • 内核调试技巧-逆向寻踪,揭开 LACP 协议流程的神秘面纱

    作者:wqiangwang,腾讯 TEG 后台开发工程师 本文通过“Kni 映射到内核的接口未能发送 LACP 报文导致 bond 不能聚合”这个问题,来探索内...

    腾讯技术工程官方号
  • SystemTap

    Systemtap 使用了类似于 awk 和 C 语言的脚本语言(类似于 Dtrace 的 D 语言)。

    hotarugali
  • 容器网络防火墙状态异常导致丢包排查记录

    作者杨玉玺,2011年至今一直从事底层网络研发,目前就职腾讯云 TKE 团队,专注 K8s 底层网络。先后就职于阿里云、金山云从事 VPC 虚拟化网络研发,对高...

    腾讯云原生
  • TCP是否会乱序

    问题 TCP客户端发送数据一般这样写 发送数据调用的是write函数,第一个参数是表示socket的文件指针,后面是要传送的数据指针和数据长度。如果数据长度超过...

    企鹅号小编
  • 高性能:6-bpftrace工具介绍【bpf performance tools读书笔记】

    bpftrace是基于BPF和BCC构建的开源跟踪程序。与BCC一样,bpftrace附带了许多性能工具和支持文档。但是,它还提供了高级编程语言,使您可以创建功...

    二狗不要跑
  • 【云原生技术研究】 从bpftrace看如何利用eBPF实现内核追踪

    bpftrace提供了一种快速利用eBPF实现动态追踪的方法,可以作为简单的命令行工具或者入门级编程工具来使用。本文以bpftrace为例,介绍如何利用eBPF...

    绿盟科技研究通讯
  • K8s容器网络防火墙状态异常导致丢包排查记录

    腾讯内部某业务在容器场景上遇到了一个比较诡异的网络问题,在容器内使用GIT,SVN工具从内部代码仓库拉取代码偶发性卡顿失败,而在容器所在的Node节点使用同样版...

    CNCF
  • 由STGW下载慢问题引发的网络传输学习之旅

    导语:本文分享了笔者现网遇到的一个文件下载慢的问题。最开始尝试过很多办法,包括域名解析,网络链路分析,AB环境测试,网络抓包等,但依然找不到原因。然后利用网络命...

    腾讯技术工程官方号
  • 掌握运维必备技能--问题故障定位

    a. on-CPU:执行中,执行中的时间通常又分为用户态时间user和系统态时间sys。

    用户6543014
  • [linux][systemtap]使用systemtap分析qemu发生crash的原因

    前言: 在《[linux][pthread]qemu的一次pthread create失败的分析》中分析了pthread失败的原因以及解决方法。修改了pidma...

    皮振伟
  • 记一次有惊无险的丢包调试经历

    某个项目把服务器从 CentOS 操作系统从 5 升级到了 7(3.10.0-693),一切都很顺利,直到我在服务器上闲逛的时候,无意间发现了一个「大问题」:网...

    LA0WAN9
  • 云服务器网络延迟与丢包问题定位(mtr工具)

    本文提供视频讲解,详细见地址:https://www.bilibili.com/video/BV1ya4y1J77C

    研究僧

扫码关注云+社区

领取腾讯云代金券