使用uml调试tcp socket的TCP_CORK
1,准备这么一个源文件,并编译得到对应的执行程序tcp_cork_test放到uml内:
[root@localhost stub]# cat tcp_cork_test.c
/**
* gcc -Wall -g -o tcp_cork_test tcp_cork_test.c
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
//char req_header[] = "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\nHost: 127.0.0.1\r\nAccept: */*\r\n\r\n";
/* Deliberately incomplete HTTP request header (no final blank line), well
 * under one MSS, so that while TCP_CORK is set the kernel has no full
 * segment to send and the data stays "corked" in the send queue. */
char req_header[] = "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n";
/*
 * Test client: connect to 10.0.0.1:80, cork the socket, write a
 * sub-MSS chunk of an HTTP request, then pause (getchar) so one can
 * observe whether the kernel ever pushes the corked data out on its
 * own (e.g. via the zero-window-probe timer path, tcp_write_timer()).
 *
 * Returns 0 on success, -1 on any syscall failure.
 */
int main(int argc, char *const *argv)
{
    int sockfd;
    int on;
    ssize_t written;
    struct sockaddr_in server_addr;

    if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
        fprintf(stderr, "Socket error,%s\n", strerror(errno));
        return -1;
    }

    /* memset() instead of the legacy/deprecated bzero(). */
    memset(&server_addr, 0, sizeof(server_addr));
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons(80);
    if (!inet_aton("10.0.0.1", &server_addr.sin_addr)) {
        fprintf(stderr, "Bad address:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* TCP_NODELAY first so any data queued around connect() time goes
     * out immediately, as the experiment intends. */
    on = 1;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_NODELAY) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* sizeof(server_addr) rather than sizeof(struct sockaddr): correct
     * and robust even if the address type changes. */
    if (connect(sockfd, (struct sockaddr *)&server_addr,
                sizeof(server_addr)) == -1) {
        fprintf(stderr, "Connect Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Turn Nagle back on before corking. */
    on = 0;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_NODELAY) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Cork the socket: sub-MSS writes should now be held back. */
    on = 1;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_CORK,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_CORK) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Check write()'s result instead of silently ignoring it (the
     * original code dropped the return value). */
    written = write(sockfd, req_header, strlen(req_header));
    if (written == -1) {
        fprintf(stderr, "Write Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Pause here while watching the kernel in gdb. */
    getchar();

    /* Uncork: any held data is flushed now. */
    on = 0;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_CORK,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_CORK) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    close(sockfd);
    return 0;
}
[root@localhost stub]# gcc -Wall -g -o tcp_cork_test tcp_cork_test.c
[root@localhost stub]# scp -P 22 tcp_cork_test [email protected]:/home/
[email protected]'s password:
tcp_cork_test 100% 14KB 13.6KB/s 00:00
[root@localhost stub]#
2,在host机器里启动对应的socket服务端,比如运行一个nginx,以便uml内的测试程序tcp_cork_test连接过来:
[root@localhost stub]# netstat -natp | grep nginx tcp 0 0 0.0.0.0:80 0.0.0.0:* LISTEN 879/nginx [root@localhost stub]#
3,根据源文件tcp_cork_test.c代码来看(先把connect()数据立即发送出去,然后堵塞write()数据,函数getchar()调用用于停顿),我这里准备要测试TCP_CORK是否能完全把socket堵塞住,也就是说我只写出“strlen(req_header)”的数据(这个长度明显不到一个MSS),然后一直等待,在这个等待的过程中,看内核(版本为3.4.4)有没有可能会把TCP_CORK“堵塞”的这一小段数据自动发送出去,从而破坏TCP_CORK原本的“堵塞”特性?根据事先的内核代码分析来看,这个问题的答案是有可能的。被TCP_CORK“堵塞”的数据有可能会在需要进行TCP窗口探测时发送出去,具体的入口函数为tcp_write_timer(),所以这里做一下验证。
将uml主进程绑定到gdb(记得做设置:set follow-fork-mode parent),在函数tcp_write_timer()处下断点:
(gdb) set follow-fork-mode parent (gdb) b tcp_write_timer Breakpoint 1 at 0x81df497: file net/ipv4/tcp_timer.c, line 449. (gdb) c Continuing.
然后在uml里(注意:不能远程连接到uml控制台时,那样的话会有很多其它数据包干扰,但通过在host机器里执行./linux…后启动进入的那个控制台无问题,即便host机器是远程接入的)执行tcp_cork_test程序:
[root@localhost home]# ./tcp_cork_test
host里attach到uml的gdb被断了下来:
(gdb) set follow-fork-mode parent
(gdb) b tcp_write_timer
Breakpoint 1 at 0x81df497: file net/ipv4/tcp_timer.c, line 449.
(gdb) c
Continuing.
Detaching after fork from child process 9933.
Breakpoint 1, tcp_write_timer (data=290971712) at net/ipv4/tcp_timer.c:449
449 {
(gdb) bt 4
#0 tcp_write_timer (data=290971712) at net/ipv4/tcp_timer.c:449
#1 0x08076def in call_timer_fn (h=0x82ae8c4) at kernel/timer.c:1115
#2 __run_timers (h=0x82ae8c4) at kernel/timer.c:1176
#3 run_timer_softirq (h=0x82ae8c4) at kernel/timer.c:1364
(More stack frames follow...)
(gdb)
逐一跟踪(由于有一些优化,所以gdb显示的代码有一点点乱序):
(gdb) n
455 if (sock_owned_by_user(sk)) {
(gdb)
461 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
(gdb)
464 if (time_after(icsk->icsk_timeout, jiffies)) {
(gdb)
472 switch (event) {
(gdb)
470 icsk->icsk_pending = 0;
(gdb)
472 switch (event) {
(gdb)
477 tcp_probe_timer(sk);
(gdb) s
tcp_probe_timer (data=290971712) at net/ipv4/tcp_timer.c:274
274 if (tp->packets_out || !tcp_send_head(sk)) {
(gdb) n
296 if (sock_flag(sk, SOCK_DEAD)) {
(gdb)
294 max_probes = sysctl_tcp_retries2;
(gdb)
296 if (sock_flag(sk, SOCK_DEAD)) {
(gdb)
305 if (icsk->icsk_probes_out > max_probes) {
(gdb)
309 tcp_send_probe0(sk);
(gdb) s
tcp_send_probe0 (sk=0x1157e040) at net/ipv4/tcp_output.c:2842
2842 {
(gdb) n
2847 err = tcp_write_wakeup(sk);
(gdb) s
tcp_write_wakeup (sk=0x1157e040) at net/ipv4/tcp_output.c:2796
2796 {
(gdb) n
2800 if (sk->sk_state == TCP_CLOSE)
(gdb)
2803 if ((skb = tcp_send_head(sk)) != NULL &&
(gdb)
2804 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
(gdb)
2803 if ((skb = tcp_send_head(sk)) != NULL &&
(gdb)
2806 unsigned int mss = tcp_current_mss(sk);
(gdb)
2807 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
(gdb)
2809 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
(gdb)
2807 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
(gdb)
2809 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
(gdb)
2816 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
(gdb)
2822 } else if (!tcp_skb_pcount(skb))
(gdb)
2825 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
(gdb)
2827 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
(gdb) s
2826 TCP_SKB_CB(skb)->when = tcp_time_stamp;
(gdb)
2827 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
(gdb)
2826 TCP_SKB_CB(skb)->when = tcp_time_stamp;
(gdb)
2827 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
(gdb)
tcp_transmit_skb (sk=0x1157e040, skb=0x11778e00, clone_it=1, gfp_mask=32) at net/ipv4/tcp_output.c:809
809 BUG_ON(!skb || !tcp_skb_pcount(skb));
(gdb) p skb->data
$1 = (
unsigned char *) 0x115160e0 "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n6\335B.q\352{\031\312\004\304u\214Q[\031\065\201\271?\201!F<2pW8\022C\314\063\215@\340\031y\031\353\302\325\r\202]\263n\223(\256\231a\\\344\352)OB\223\266\372\246n\246\320\025\203\337uX\372_\017\370\366\265\347\207\337 \031\211\223O:\353ds\312\004\312\017(\263\020UJ\244H\325\323X\300\364>\274\026X\v|(j?\242L\340\350??U\271\315\373\236\313\005oDl\367\004\350\270a\306\027\241\327\375\252\071\311(\241\237\221k\374A\023\350a\276\211\361", <incomplete sequence \361>...
(gdb) n
798 {
(gdb)
809 BUG_ON(!skb || !tcp_skb_pcount(skb));
(gdb)
814 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
(gdb)
815 __net_timestamp(skb);
(gdb)
817 if (likely(clone_it)) {
(gdb)
818 if (unlikely(skb_cloned(skb)))
(gdb)
821 skb = skb_clone(skb, gfp_mask);
(gdb)
822 if (unlikely(!skb))
(gdb)
828 tcb = TCP_SKB_CB(skb);
(gdb)
829 memset(&opts, 0, sizeof(opts));
(gdb)
828 tcb = TCP_SKB_CB(skb);
(gdb)
829 memset(&opts, 0, sizeof(opts));
(gdb)
831 if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
(gdb)
834 tcp_options_size = tcp_established_options(sk, skb, &opts,
(gdb)
836 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
(gdb)
838 if (tcp_packets_in_flight(tp) == 0) {
(gdb)
839 tcp_ca_event(sk, CA_EVENT_TX_START);
(gdb)
840 skb->ooo_okay = 1;
(gdb)
844 skb_push(skb, tcp_header_size);
(gdb) p tcp_header_size
$2 = 32
(gdb) n
845 skb_reset_transport_header(skb);
(gdb) p skb->data+32
$3 = (
unsigned char *) 0x115160e0 "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n6\335B.q\352{\031\312\004\304u\214Q[\031\065\201\271?\201!F<2pW8\022C\314\063\215@\340\031y\031\353\302\325\r\202]\263n\223(\256\231a\\\344\352)OB\223\266\372\246n\246\320\025\203\337uX\372_\017\370\366\265\347\207\337 \031\211\223O:\353ds\312\004\312\017(\263\020UJ\244H\325\323X\300\364>\274\026X\v|(j?\242L\340\350??U\271\315\373\236\313\005oDl\367\004\350\270a\306\027\241\327\375\252\071\311(\241\237\221k\374A\023\350a\276\211\361", <incomplete sequence \361>...
(gdb) n
846 skb_set_owner_w(skb, sk);
(gdb)
850 th->source = inet->inet_sport;
(gdb)
849 th = tcp_hdr(skb);
(gdb)
850 th->source = inet->inet_sport;
(gdb)
851 th->dest = inet->inet_dport;
(gdb)
852 th->seq = htonl(tcb->seq);
(gdb)
853 th->ack_seq = htonl(tp->rcv_nxt);
(gdb)
854 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
(gdb)
857 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
(gdb)
863 th->window = htons(tcp_select_window(sk));
(gdb)
865 th->check = 0;
(gdb)
866 th->urg_ptr = 0;
(gdb)
869 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
(gdb)
879 tcp_options_write((__be32 *)(th + 1), tp, &opts);
(gdb)
880 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
(gdb)
881 TCP_ECN_send(sk, skb, tcp_header_size);
(gdb)
892 icsk->icsk_af_ops->send_check(sk, skb);
(gdb)
894 if (likely(tcb->tcp_flags & TCPHDR_ACK))
(gdb)
895 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
(gdb)
897 if (skb->len != tcp_header_size)
(gdb)
898 tcp_event_data_sent(tp, sk);
(gdb)
900 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
(gdb)
902 tcp_skb_pcount(skb));
(gdb)
904 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
(gdb) s
ip_queue_xmit (skb=0x11778ea8, fl=0x1157e1f0) at net/ipv4/ip_output.c:336
336 {
(gdb) n
337 struct sock *sk = skb->sk;
(gdb)
内容有点多,但主要有两点需要说明:
1,从堆栈可以看出,函数tcp_write_timer()是从定时器里调入进入的,并非正常的tcp数据发送流程,这从侧面证实了TCP_CORK的确起到了“塞子”的作用。
2,函数tcp_write_timer()会调用到tcp_transmit_skb()函数内再到ip_queue_xmit()函数等(详细调用流程是:tcp_write_timer()-> tcp_probe_timer() -> tcp_send_probe0() -> tcp_write_wakeup() -> tcp_transmit_skb() -> ip_queue_xmit()),将被TCP_CORK塞住的数据发送出去,从skb的数据打印验证了这一点(另外看到这里的tcp头部长度为32字节):
(gdb)
tcp_transmit_skb (sk=0x1157e040, skb=0x11778e00, clone_it=1, gfp_mask=32) at net/ipv4/tcp_output.c:809
809 BUG_ON(!skb || !tcp_skb_pcount(skb));
(gdb) p skb->data
$1 = (
unsigned char *) 0x115160e0 "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n6\335B.q\352{\031\312\004\304u\214Q[\031\065\201\271?\201!F<2pW8\022C\314\063\215@\340\031y\031\353\302\325\r\202]\263n\223(\256\231a\\\344\352)OB\223\266\372\246n\246\320\025\203\337uX\372_\017\370\366\265\347\207\337 \031\211\223O:\353ds\312\004\312\017(\263\020UJ\244H\325\323X\300\364>\274\026X\v|(j?\242L\340\350??U\271\315\373\236\313\005oDl\367\004\350\270a\306\027\241\327\375\252\071\311(\241\237\221k\374A\023\350a\276\211\361", <incomplete sequence \361>...
至此,那么前面问题的答案就得到了验证。其实,这也很容易想明白,TCP_CORK的“堵塞”特性无非是为了提高网络利用率,既然反正是要发一个包(窗口探测包),如果有数据待发送,那么干脆就直接发送一个数据包岂不是更好?
如果要测试不设置TCP_CORK的情况如何,那么源代码可改成下面这样(主要关注req_header1数据非堵塞发送流程):
[root@localhost stub]# cat tcp_cork_test.c
/**
* gcc -Wall -g -o tcp_cork_test tcp_cork_test.c
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
//char req_header[] = "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\nHost: 127.0.0.1\r\nAccept: */*\r\n\r\n";
/* First, deliberately incomplete chunk of the HTTP request (sub-MSS),
 * written while TCP_CORK is set so it stays queued. */
char req_header[] = "GET / HTTP/1.1\r\nUser-Agent: curl/7.19.7\r\n";
/* Remainder of the request, written after uncorking to observe the
 * normal (non-corked) tcp_push() send path. */
char req_header1[] = "Host: 10.0.0.1\r\nAccept: */*\r\n\r\n";
/*
 * Variant of the TCP_CORK test: after uncorking, a second write()
 * (req_header1) goes through the normal send path (tcp_sendmsg() ->
 * tcp_push()), which is what this version is meant to demonstrate.
 *
 * Returns 0 on success, -1 on any syscall failure.
 */
int main(int argc, char *const *argv)
{
    int sockfd;
    int on;
    ssize_t written;
    struct sockaddr_in server_addr;

    if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
        fprintf(stderr, "Socket error,%s\n", strerror(errno));
        return -1;
    }

    /* memset() instead of the legacy/deprecated bzero(). */
    memset(&server_addr, 0, sizeof(server_addr));
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons(80);
    if (!inet_aton("10.0.0.1", &server_addr.sin_addr)) {
        fprintf(stderr, "Bad address:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* TCP_NODELAY first so any data queued around connect() time goes
     * out immediately. */
    on = 1;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_NODELAY) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* sizeof(server_addr) rather than sizeof(struct sockaddr). */
    if (connect(sockfd, (struct sockaddr *)&server_addr,
                sizeof(server_addr)) == -1) {
        fprintf(stderr, "Connect Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Turn Nagle back on before corking. */
    on = 0;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_NODELAY) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Cork the socket: the sub-MSS write below should be held back. */
    on = 1;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_CORK,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_CORK) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Check write()'s result instead of silently ignoring it (the
     * original code dropped both return values). */
    written = write(sockfd, req_header, strlen(req_header));
    if (written == -1) {
        fprintf(stderr, "Write Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Pause here while watching the kernel in gdb. */
    getchar();

    /* Uncork: held data is flushed now. */
    on = 0;
    if (setsockopt(sockfd, IPPROTO_TCP, TCP_CORK,
                   (const void *)&on, sizeof(int)) == -1) {
        fprintf(stderr, "Setsockopt(TCP_CORK) Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    /* Second write goes out via the normal tcp_push() path. */
    written = write(sockfd, req_header1, strlen(req_header1));
    if (written == -1) {
        fprintf(stderr, "Write Error:%s\n", strerror(errno));
        close(sockfd);
        return -1;
    }

    getchar();

    close(sockfd);
    return 0;
}
[root@localhost stub]#
对于”Host: 10.0.0.1\r\nAccept: */*\r\n\r\n”数据的发送,通过uml调试如下(直接在函数tcp_transmit_skb()处下断点,被断下来后可以看到它的调用堆栈是直接从tcp_push()函数过来的,即是正常的tcp数据发送流程):
(gdb) c
Continuing.
Breakpoint 2, tcp_transmit_skb (sk=0x1157e040, skb=0x106e4e00, clone_it=1, gfp_mask=32)
at net/ipv4/tcp_output.c:809
809 BUG_ON(!skb || !tcp_skb_pcount(skb));
(gdb) bt 5
#0 tcp_transmit_skb (sk=0x1157e040, skb=0x106e4e00, clone_it=1, gfp_mask=32) at net/ipv4/tcp_output.c:809
#1 0x081de7a8 in tcp_write_xmit (sk=0x1157e040, mss_now=1448, nonagle=0, push_one=0, gfp=32)
at net/ipv4/tcp_output.c:1797
#2 0x081de917 in __tcp_push_pending_frames (sk=0x1157e040, cur_mss=1448, nonagle=0)
at net/ipv4/tcp_output.c:1835
#3 0x081d3828 in tcp_push (iocb=0x106ade3c, sk=0x1157e040, msg=0x106addf4, size=31) at net/ipv4/tcp.c:569
#4 tcp_sendmsg (iocb=0x106ade3c, sk=0x1157e040, msg=0x106addf4, size=31) at net/ipv4/tcp.c:1116
(More stack frames follow...)
(gdb) n
798 {
(gdb) n
809 BUG_ON(!skb || !tcp_skb_pcount(skb));
(gdb) n
814 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
(gdb)
815 __net_timestamp(skb);
(gdb)
817 if (likely(clone_it)) {
(gdb)
818 if (unlikely(skb_cloned(skb)))
(gdb)
821 skb = skb_clone(skb, gfp_mask);
(gdb)
822 if (unlikely(!skb))
(gdb)
828 tcb = TCP_SKB_CB(skb);
(gdb)
829 memset(&opts, 0, sizeof(opts));
(gdb)
828 tcb = TCP_SKB_CB(skb);
(gdb)
829 memset(&opts, 0, sizeof(opts));
(gdb)
831 if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
(gdb)
834 tcp_options_size = tcp_established_options(sk, skb, &opts,
(gdb)
836 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
(gdb)
838 if (tcp_packets_in_flight(tp) == 0) {
(gdb)
839 tcp_ca_event(sk, CA_EVENT_TX_START);
(gdb)
840 skb->ooo_okay = 1;
(gdb)
844 skb_push(skb, tcp_header_size);
(gdb) p tcp_header_size
$4 = 32
(gdb) p skb->data+32
$5 = (unsigned char *) 0x1157c100 ""
(gdb) n
845 skb_reset_transport_header(skb);
(gdb) p skb->data+32
$6 = (unsigned char *) 0x1157c0e0 "Host: 10.0.0.1\r\nAccept: */*\r\n\r\n"
(gdb)
另外,通过在host里执行“tcpdump -i tap1 -s 0 -w a.pcap”抓包后通过wireshark查看host机器收到的真实数据包,从wireshark分析显示的结果包组来看,与前面的代码验证情况完整一致。
那有没有可能一直堵塞数据而不被内核自动发出去呢?根据个人对linux内核的代码来看,只要满足:在收到上一个包的ACK之前把待发数据包push到协议栈的发送队列&后续应用层不再新下发数据&后续收到ACK后没有触发零窗口探测,那么就有这个可能,不过由于各个linux内核具体实现不一样,我对此(零窗口包的探测,因为堵塞数据是顺着这个目的而一起被发送的)又没有特别深入的研究,利用UML实验验证也比较麻烦(需要满足上面的众多条件),所以我不能肯定(请注意这个,别说我误导新人,我只是根据目前了解来看,估计有这个可能)。嘛啦,留待感兴趣的人自己摸索&验证吧,另外,有本书《TCP/IP架构、设计与应用(Linux版) 》有提到这方面内容,并且欢迎交流。
转载请保留地址:http://lenky.info/archives/2012/08/26/1892 或 http://lenky.info/?p=1892
备注:如无特殊说明,文章内容均出自Lenky个人的真实理解而并非存心妄自揣测来故意愚人耳目。由于个人水平有限,虽力求内容正确无误,但仍然难免出错,请勿见怪,如果可以则请留言告之,并欢迎来讨论。另外值得说明的是,Lenky的部分文章以及部分内容参考借鉴了网络上各位网友的热心分享,特别是一些带有完全参考的文章,其后附带的链接内容也许更直接、更丰富,而我只是做了一下归纳&转述,在此也一并表示感谢。关于本站的所有技术文章,欢迎转载,但请遵从CC创作共享协议,而一些私人性质较强的心情随笔,建议不要转载。
法律:根据最新颁布的《信息网络传播权保护条例》,如果您认为本文章的任何内容侵犯了您的权利,请以或书面等方式告知,本站将及时删除相关内容或链接。
创作共享协议. Copyright ©2011-2013