使用uml调试tcp socket的TCP_CORK
1,准备这么一个源文件,并编译得到对应的执行程序tcp_cork_test放到uml内:
/* [root@localhost stub]# cat tcp_cork_test.c */
/**
 * gcc -Wall -g -o tcp_cork_test tcp_cork_test.c
 *
 * Test stub for observing TCP_CORK behavior: connect to 10.0.0.1:80,
 * enable TCP_CORK, write a partial HTTP request header (well under one
 * MSS), then block in getchar() so we can watch (from gdb attached to
 * the UML kernel) whether the kernel ever sends the corked data on its
 * own, e.g. from the window-probe timer path.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>

//char req_header[] = "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\nHost: 127.0.0.1\r\nAccept: */*\r\n\r\n";
char req_header[] = "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n";

int main(int argc, char *const *argv)
{
    int sockfd;
    int on;
    ssize_t written;
    struct sockaddr_in server_addr;

    if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
        fprintf(stderr, "Socket error,%s\n", strerror(errno));
        return -1;
    }

    memset(&server_addr, 0, sizeof(server_addr));
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons(80);
    if (!inet_aton("10.0.0.1", &server_addr.sin_addr)) {
        /* inet_aton() does not set errno on failure, so no strerror() here. */
        fprintf(stderr, "Bad address\n");
        close(sockfd);
        return -1;
    }

    /* TCP_NODELAY before connect: make sure nothing is delayed by Nagle
     * while the connection is being set up. */
    on = 1;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_NODELAY) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    if (connect(sockfd, (struct sockaddr *)(&server_addr),
                sizeof(server_addr)) == -1) {
        fprintf(stderr, "Connect Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Drop TCP_NODELAY again so it cannot interfere with TCP_CORK. */
    on = 0;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_NODELAY) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Cork the socket: data written below should stay queued in the kernel. */
    on = 1;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_CORK,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_CORK) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Queue less than one MSS of data while corked; check the result so a
     * short/failed write cannot silently invalidate the experiment. */
    written = write(sockfd, req_header, strlen(req_header));
    if (written != (ssize_t)strlen(req_header)) {
        fprintf(stderr, "Write Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Pause here and watch (in gdb) whether the kernel sends the corked
     * data by itself, e.g. via tcp_write_timer(). */
    getchar();

    /* Uncork: any still-pending data is flushed now. */
    on = 0;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_CORK,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_CORK) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    close(sockfd);
    return 0;
}
[root@localhost stub]# gcc -Wall -g -o tcp_cork_test tcp_cork_test.c [root@localhost stub]# scp -P 22 tcp_cork_test [email protected]:/home/ [email protected]'s password: tcp_cork_test 100% 14KB 13.6KB/s 00:00 [root@localhost stub]#
2,在host机器里启动对应的socket服务端,比如运行一个nginx,以便uml内的测试程序tcp_cork_test连接过来:
[root@localhost stub]# netstat -natp | grep nginx tcp 0 0 0.0.0.0:80 0.0.0.0:* LISTEN 879/nginx [root@localhost stub]#
3,根据源文件tcp_cork_test.c代码来看(先把connect()数据立即发送出去,然后堵塞write()数据,函数getchar()调用用于停顿),我这里准备要测试TCP_CORK是否能完全把socket堵塞住,也就是说我只写出“strlen(req_header)”的数据(这个长度明显不到一个MSS),然后一直等待,在这个等待的过程中,看内核(版本为3.4.4)有没有可能会把TCP_CORK“堵塞”的这一小段数据自动发送出去,从而破坏TCP_CORK原本的“堵塞”特性?根据事先的内核代码分析来看,这个问题的答案是有可能的。被TCP_CORK“堵塞”的数据有可能会在需要进行TCP窗口探测时发送出去,具体的入口函数为tcp_write_timer(),所以这里做一下验证。
将uml主进程绑定到gdb(记得做设置:set follow-fork-mode parent),在函数tcp_write_timer()处下断点:
(gdb) set follow-fork-mode parent (gdb) b tcp_write_timer Breakpoint 1 at 0x81df497: file net/ipv4/tcp_timer.c, line 449. (gdb) c Continuing.
然后在uml里执行tcp_cork_test程序(注意:不能通过远程连接到uml的控制台来执行,那样的话会有很多其它数据包干扰;但使用在host机器里执行./linux…后启动进入的那个控制台则无问题,即便host机器本身是远程接入的):
[root@localhost home]# ./tcp_cork_test
host里attach到uml的gdb被断了下来:
(gdb) set follow-fork-mode parent (gdb) b tcp_write_timer Breakpoint 1 at 0x81df497: file net/ipv4/tcp_timer.c, line 449. (gdb) c Continuing. Detaching after fork from child process 9933. Breakpoint 1, tcp_write_timer (data=290971712) at net/ipv4/tcp_timer.c:449 449 { (gdb) bt 4 #0 tcp_write_timer (data=290971712) at net/ipv4/tcp_timer.c:449 #1 0x08076def in call_timer_fn (h=0x82ae8c4) at kernel/timer.c:1115 #2 __run_timers (h=0x82ae8c4) at kernel/timer.c:1176 #3 run_timer_softirq (h=0x82ae8c4) at kernel/timer.c:1364 (More stack frames follow...) (gdb)
逐一跟踪(由于有一些优化,所以gdb显示的代码有一点点乱序):
(gdb) n 455 if (sock_owned_by_user(sk)) { (gdb) 461 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) (gdb) 464 if (time_after(icsk->icsk_timeout, jiffies)) { (gdb) 472 switch (event) { (gdb) 470 icsk->icsk_pending = 0; (gdb) 472 switch (event) { (gdb) 477 tcp_probe_timer(sk); (gdb) s tcp_probe_timer (data=290971712) at net/ipv4/tcp_timer.c:274 274 if (tp->packets_out || !tcp_send_head(sk)) { (gdb) n 296 if (sock_flag(sk, SOCK_DEAD)) { (gdb) 294 max_probes = sysctl_tcp_retries2; (gdb) 296 if (sock_flag(sk, SOCK_DEAD)) { (gdb) 305 if (icsk->icsk_probes_out > max_probes) { (gdb) 309 tcp_send_probe0(sk); (gdb) s tcp_send_probe0 (sk=0x1157e040) at net/ipv4/tcp_output.c:2842 2842 { (gdb) n 2847 err = tcp_write_wakeup(sk); (gdb) s tcp_write_wakeup (sk=0x1157e040) at net/ipv4/tcp_output.c:2796 2796 { (gdb) n 2800 if (sk->sk_state == TCP_CLOSE) (gdb) 2803 if ((skb = tcp_send_head(sk)) != NULL && (gdb) 2804 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { (gdb) 2803 if ((skb = tcp_send_head(sk)) != NULL && (gdb) 2806 unsigned int mss = tcp_current_mss(sk); (gdb) 2807 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; (gdb) 2809 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) (gdb) 2807 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; (gdb) 2809 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) (gdb) 2816 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || (gdb) 2822 } else if (!tcp_skb_pcount(skb)) (gdb) 2825 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; (gdb) 2827 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); (gdb) s 2826 TCP_SKB_CB(skb)->when = tcp_time_stamp; (gdb) 2827 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); (gdb) 2826 TCP_SKB_CB(skb)->when = tcp_time_stamp; (gdb) 2827 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); (gdb) tcp_transmit_skb (sk=0x1157e040, skb=0x11778e00, clone_it=1, gfp_mask=32) at net/ipv4/tcp_output.c:809 809 BUG_ON(!skb || !tcp_skb_pcount(skb)); (gdb) p skb->data $1 = ( unsigned char *) 
0x115160e0 "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n6\335B.q\352{\031\312\004\304u\214Q[\031\065\201\271?\201!F<2pW8\022C\314\063\215@\340\031y\031\353\302\325\r\202]\263n\223(\256\231a\\\344\352)OB\223\266\372\246n\246\320\025\203\337uX\372_\017\370\366\265\347\207\337 \031\211\223O:\353ds\312\004\312\017(\263\020UJ\244H\325\323X\300\364>\274\026X\v|(j?\242L\340\350??U\271\315\373\236\313\005oDl\367\004\350\270a\306\027\241\327\375\252\071\311(\241\237\221k\374A\023\350a\276\211\361", <incomplete sequence \361>... (gdb) n 798 { (gdb) 809 BUG_ON(!skb || !tcp_skb_pcount(skb)); (gdb) 814 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) (gdb) 815 __net_timestamp(skb); (gdb) 817 if (likely(clone_it)) { (gdb) 818 if (unlikely(skb_cloned(skb))) (gdb) 821 skb = skb_clone(skb, gfp_mask); (gdb) 822 if (unlikely(!skb)) (gdb) 828 tcb = TCP_SKB_CB(skb); (gdb) 829 memset(&opts, 0, sizeof(opts)); (gdb) 828 tcb = TCP_SKB_CB(skb); (gdb) 829 memset(&opts, 0, sizeof(opts)); (gdb) 831 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) (gdb) 834 tcp_options_size = tcp_established_options(sk, skb, &opts, (gdb) 836 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); (gdb) 838 if (tcp_packets_in_flight(tp) == 0) { (gdb) 839 tcp_ca_event(sk, CA_EVENT_TX_START); (gdb) 840 skb->ooo_okay = 1; (gdb) 844 skb_push(skb, tcp_header_size); (gdb) p tcp_header_size $2 = 32 (gdb) n 845 skb_reset_transport_header(skb); (gdb) p skb->data+32 $3 = ( unsigned char *) 0x115160e0 "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n6\335B.q\352{\031\312\004\304u\214Q[\031\065\201\271?\201!F<2pW8\022C\314\063\215@\340\031y\031\353\302\325\r\202]\263n\223(\256\231a\\\344\352)OB\223\266\372\246n\246\320\025\203\337uX\372_\017\370\366\265\347\207\337 \031\211\223O:\353ds\312\004\312\017(\263\020UJ\244H\325\323X\300\364>\274\026X\v|(j?\242L\340\350??U\271\315\373\236\313\005oDl\367\004\350\270a\306\027\241\327\375\252\071\311(\241\237\221k\374A\023\350a\276\211\361", <incomplete sequence \361>... 
(gdb) n 846 skb_set_owner_w(skb, sk); (gdb) 850 th->source = inet->inet_sport; (gdb) 849 th = tcp_hdr(skb); (gdb) 850 th->source = inet->inet_sport; (gdb) 851 th->dest = inet->inet_dport; (gdb) 852 th->seq = htonl(tcb->seq); (gdb) 853 th->ack_seq = htonl(tp->rcv_nxt); (gdb) 854 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | (gdb) 857 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { (gdb) 863 th->window = htons(tcp_select_window(sk)); (gdb) 865 th->check = 0; (gdb) 866 th->urg_ptr = 0; (gdb) 869 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { (gdb) 879 tcp_options_write((__be32 *)(th + 1), tp, &opts); (gdb) 880 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) (gdb) 881 TCP_ECN_send(sk, skb, tcp_header_size); (gdb) 892 icsk->icsk_af_ops->send_check(sk, skb); (gdb) 894 if (likely(tcb->tcp_flags & TCPHDR_ACK)) (gdb) 895 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); (gdb) 897 if (skb->len != tcp_header_size) (gdb) 898 tcp_event_data_sent(tp, sk); (gdb) 900 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) (gdb) 902 tcp_skb_pcount(skb)); (gdb) 904 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); (gdb) s ip_queue_xmit (skb=0x11778ea8, fl=0x1157e1f0) at net/ipv4/ip_output.c:336 336 { (gdb) n 337 struct sock *sk = skb->sk; (gdb)
内容有点多,但主要有两点需要说明:
1,从堆栈可以看出,函数tcp_write_timer()是从定时器里调入进入的,并非正常的tcp数据发送流程,这从侧面证实了TCP_CORK的确起到了“塞子”的作用。
2,函数tcp_write_timer()会调用到tcp_transmit_skb()函数内再到ip_queue_xmit()函数等(详细调用流程是:tcp_write_timer()-> tcp_probe_timer() -> tcp_send_probe0() -> tcp_write_wakeup() -> tcp_transmit_skb() -> ip_queue_xmit()),将被TCP_CORK塞住的数据发送出去,从skb的数据打印验证了这一点(另外看到这里的tcp头部长度为32字节):
(gdb) tcp_transmit_skb (sk=0x1157e040, skb=0x11778e00, clone_it=1, gfp_mask=32) at net/ipv4/tcp_output.c:809 809 BUG_ON(!skb || !tcp_skb_pcount(skb)); (gdb) p skb->data $1 = ( unsigned char *) 0x115160e0 "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n6\335B.q\352{\031\312\004\304u\214Q[\031\065\201\271?\201!F<2pW8\022C\314\063\215@\340\031y\031\353\302\325\r\202]\263n\223(\256\231a\\\344\352)OB\223\266\372\246n\246\320\025\203\337uX\372_\017\370\366\265\347\207\337 \031\211\223O:\353ds\312\004\312\017(\263\020UJ\244H\325\323X\300\364>\274\026X\v|(j?\242L\340\350??U\271\315\373\236\313\005oDl\367\004\350\270a\306\027\241\327\375\252\071\311(\241\237\221k\374A\023\350a\276\211\361", <incomplete sequence \361>...
至此,前面问题的答案就得到了验证。其实,这也很容易想明白,TCP_CORK的“堵塞”特性无非是为了提高网络利用率,既然反正都要发一个包(窗口探测包),如果恰好有数据待发送,那么干脆直接发送一个数据包岂不是更好?
如果要测试不设置TCP_CORK的情况如何,那么源代码可改成下面这样(主要关注req_header1数据非堵塞发送流程):
/* [root@localhost stub]# cat tcp_cork_test.c */
/**
 * gcc -Wall -g -o tcp_cork_test tcp_cork_test.c
 *
 * Variant without keeping TCP_CORK set at the end: after uncorking,
 * req_header1 is written and sent through the normal TCP transmit path
 * (tcp_sendmsg() -> tcp_push()), for comparison with the corked case.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>

//char req_header[] = "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\nHost: 127.0.0.1\r\nAccept: */*\r\n\r\n";
char req_header[] = "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n";
char req_header1[] = "Host: 10.0.0.1\r\nAccept: */*\r\n\r\n";

int main(int argc, char *const *argv)
{
    int sockfd;
    int on;
    ssize_t written;
    struct sockaddr_in server_addr;

    if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
        fprintf(stderr, "Socket error,%s\n", strerror(errno));
        return -1;
    }

    memset(&server_addr, 0, sizeof(server_addr));
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons(80);
    if (!inet_aton("10.0.0.1", &server_addr.sin_addr)) {
        /* inet_aton() does not set errno on failure, so no strerror() here. */
        fprintf(stderr, "Bad address\n");
        close(sockfd);
        return -1;
    }

    /* TCP_NODELAY before connect: make sure nothing is delayed by Nagle
     * while the connection is being set up. */
    on = 1;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_NODELAY) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    if (connect(sockfd, (struct sockaddr *)(&server_addr),
                sizeof(server_addr)) == -1) {
        fprintf(stderr, "Connect Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Drop TCP_NODELAY again so it cannot interfere with TCP_CORK. */
    on = 0;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_NODELAY) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Cork the socket: the first header fragment should stay queued. */
    on = 1;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_CORK,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_CORK) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    written = write(sockfd, req_header, strlen(req_header));
    if (written != (ssize_t)strlen(req_header)) {
        fprintf(stderr, "Write Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    getchar();

    /* Uncork before writing req_header1 so it goes out via the normal
     * tcp_push() path (this is what we want to observe in gdb). */
    on = 0;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_CORK,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_CORK) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    written = write(sockfd, req_header1, strlen(req_header1));
    if (written != (ssize_t)strlen(req_header1)) {
        fprintf(stderr, "Write Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    getchar();

    close(sockfd);
    return 0;
}
/* [root@localhost stub]# */
对于”Host: 10.0.0.1\r\nAccept: */*\r\n\r\n”数据的发送,通过uml调试如下(直接在函数tcp_transmit_skb()处下断点,被断下来后可以看到它的调用堆栈是直接从tcp_push()函数过来的,即是正常的tcp数据发送流程):
(gdb) c Continuing. Breakpoint 2, tcp_transmit_skb (sk=0x1157e040, skb=0x106e4e00, clone_it=1, gfp_mask=32) at net/ipv4/tcp_output.c:809 809 BUG_ON(!skb || !tcp_skb_pcount(skb)); (gdb) bt 5 #0 tcp_transmit_skb (sk=0x1157e040, skb=0x106e4e00, clone_it=1, gfp_mask=32) at net/ipv4/tcp_output.c:809 #1 0x081de7a8 in tcp_write_xmit (sk=0x1157e040, mss_now=1448, nonagle=0, push_one=0, gfp=32) at net/ipv4/tcp_output.c:1797 #2 0x081de917 in __tcp_push_pending_frames (sk=0x1157e040, cur_mss=1448, nonagle=0) at net/ipv4/tcp_output.c:1835 #3 0x081d3828 in tcp_push (iocb=0x106ade3c, sk=0x1157e040, msg=0x106addf4, size=31) at net/ipv4/tcp.c:569 #4 tcp_sendmsg (iocb=0x106ade3c, sk=0x1157e040, msg=0x106addf4, size=31) at net/ipv4/tcp.c:1116 (More stack frames follow...) (gdb) n 798 { (gdb) n 809 BUG_ON(!skb || !tcp_skb_pcount(skb)); (gdb) n 814 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) (gdb) 815 __net_timestamp(skb); (gdb) 817 if (likely(clone_it)) { (gdb) 818 if (unlikely(skb_cloned(skb))) (gdb) 821 skb = skb_clone(skb, gfp_mask); (gdb) 822 if (unlikely(!skb)) (gdb) 828 tcb = TCP_SKB_CB(skb); (gdb) 829 memset(&opts, 0, sizeof(opts)); (gdb) 828 tcb = TCP_SKB_CB(skb); (gdb) 829 memset(&opts, 0, sizeof(opts)); (gdb) 831 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) (gdb) 834 tcp_options_size = tcp_established_options(sk, skb, &opts, (gdb) 836 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); (gdb) 838 if (tcp_packets_in_flight(tp) == 0) { (gdb) 839 tcp_ca_event(sk, CA_EVENT_TX_START); (gdb) 840 skb->ooo_okay = 1; (gdb) 844 skb_push(skb, tcp_header_size); (gdb) p tcp_header_size $4 = 32 (gdb) p skb->data+32 $5 = (unsigned char *) 0x1157c100 "" (gdb) n 845 skb_reset_transport_header(skb); (gdb) p skb->data+32 $6 = (unsigned char *) 0x1157c0e0 "Host: 10.0.0.1\r\nAccept: */*\r\n\r\n" (gdb)
另外,通过在host里执行“tcpdump -i tap1 -s 0 -w a.pcap”抓包后通过wireshark查看host机器收到的真实数据包,从wireshark分析显示的结果包组来看,与前面的代码验证情况完整一致。
那有没有可能一直堵塞数据而不被内核自动发出去呢?根据个人对linux内核的代码来看,只要满足:在收到上一个包的ACK之前把待发数据包push到协议栈的发送队列&后续应用层不再新下发数据&后续收到ACK后没有触发零窗口探测,那么就有这个可能,不过由于各个linux内核具体实现不一样,我对此(零窗口包的探测,因为堵塞数据是顺着这个目的而一起被发送的)又没有特别深入的研究,利用UML实验验证也比较麻烦(需要满足上面的众多条件),所以我不能肯定(请注意这个,别说我误导新人,我只是根据目前了解来看,估计有这个可能)。嘛啦,留待感兴趣的人自己摸索&验证吧,另外,有本书《TCP/IP架构、设计与应用(Linux版) 》有提到这方面内容,并且欢迎交流。
转载请保留地址:http://lenky.info/archives/2012/08/26/1892 或 http://lenky.info/?p=1892
备注:如无特殊说明,文章内容均出自Lenky个人的真实理解而并非存心妄自揣测来故意愚人耳目。由于个人水平有限,虽力求内容正确无误,但仍然难免出错,请勿见怪,如果可以则请留言告之,并欢迎来讨论。另外值得说明的是,Lenky的部分文章以及部分内容参考借鉴了网络上各位网友的热心分享,特别是一些带有完全参考的文章,其后附带的链接内容也许更直接、更丰富,而我只是做了一下归纳&转述,在此也一并表示感谢。关于本站的所有技术文章,欢迎转载,但请遵从CC创作共享协议,而一些私人性质较强的心情随笔,建议不要转载。
法律:根据最新颁布的《信息网络传播权保护条例》,如果您认为本文章的任何内容侵犯了您的权利,请以邮件或书面等方式告知,本站将及时删除相关内容或链接。