http://blog.sina.com.cn/s/indexlist_1657348185_2.html
IP包的生成和发送接口
====================
(1) Linux内核中有3种基本的IP包生成器, 它们分别为ip_build_xmit(), ip_queue_xmit(),
ip_build_and_send_pkt(). ip_build_and_send_pkt()是一简单的IP包头封装接口,
它按照输入包的路由添加一个IP包头后直接输出, 不进行分片处理, 用于tcp_v4_send_synack()中.
ip_send_reply()是基于ip_build_xmit()的一个函数,
用于tcp_v4_send_ack()和tcp_v4_send_reset()中.
(2) ip_build_xmit()使用用户定义的回调函数直接读取用户数据片段生成IP包输出.
如果需要分片,ip_build_xmit()按照最后一个片段到第一个片段的顺序来生成IP包,
这是因为第一个IP包片段的数据区可能包含对整个IP包数据区的校验码,
在回调函数中用户可能会计算输出数据的校验码,
采用从后向前的输出顺序可使校验码自然地写到第一个片段中.
(3) ip_queue_xmit()完成面向连接套接字输出包的路由和IP包头封装. 当套接字处于连接状态时,
所有从套接字发出的包都具有确定的路由, 无需为每一个输出包查询它的目的入口,
可将套接字直接绑定到路由入口上, 这由套接字的目的缓冲指针(dst_cache)来完成.
ip_queue_xmit()首先为输入包建立IP包头, 经过本地包过滤器后,
如有需要, 再将IP包分片输出(ip_fragment).
(4) IP包生成器的输出经过本地包过滤器后输入包的路由入口, 对于单播(unicast)地址来说,
输入到IP输出器中(ip_output); 对于广播或多播(multicast)地址来说, 输入到IP多播输出器(ip_mc_output).
在IP输出器中, 再经过路由后过滤器,
进入路由的"邻居"入口(dst->neighbour->output)或硬件帧头缓冲入口(dst->hh->hh_output).
邻居是指与主机自己在网络接口设备层次上直达的相邻主机.
邻居负责解析输出包的硬件投送地址, 将包投递给相邻的目的主机或网关主机.
当邻居成功解析包的硬件投送地址时, 将在包的目的入口上创建硬件帧头缓冲结构(dst->hh),
使得后继包可以直接使用组装好的帧头, 直接将包传递给包调度器(dev_queue_xmit).
包调度器按照包的优先级进行重排, 最后将包提交给设备驱动程序发送(dev->hard_start_xmit).
IP包生成接口
------------
; net/ipv4/ip_output.c:
/* Default time-to-live for IP packets; the original note gives IPDEFTTL as 64. */
int sysctl_ip_default_ttl = IPDEFTTL;
/*
 * Prepend an IP header (plus options, if any) to the payload already held
 * in skb and hand the packet to the NF_IP_LOCAL_OUT hook.
 *
 * The route is taken from skb->dst, which must already be set by the caller.
 * No fragmentation is performed here.
 *
 * skb:   packet whose data area currently holds the transport payload
 * sk:    socket supplying tos, ttl and protocol
 * saddr: not referenced by this body; the source address comes from the route
 * daddr: passed through to ip_options_build() when options are present
 * opt:   IP options to insert, or NULL
 *
 * Returns whatever NF_HOOK() returns.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Make room in front of the payload for the header and any options. */
	if (opt)
		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
	else
		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));

	iph->version = 4;
	iph->ihl = 5;
	iph->tos = sk->protinfo.af_inet.tos;
	iph->frag_off = 0;
	/* Set DF when fragmentation is disallowed for this destination entry. */
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= htons(IP_DF);
	iph->ttl = sk->protinfo.af_inet.ttl;	/* TTL from the socket's protocol settings */
	iph->daddr = rt->rt_dst;		/* destination address of the route */
	iph->saddr = rt->rt_src;		/* source address of the route */
	iph->protocol = sk->protocol;		/* IP protocol number of the socket */
	iph->tot_len = htons(skb->len);		/* total packet length */
	/*
	 * Assign the packet identification. Per the original annotation,
	 * packets that must not be fragmented get an id of zero
	 * (ip_select_ident is defined elsewhere; not verified here).
	 */
	ip_select_ident(iph, &rt->u.dst);
	skb->nh.iph = iph;

	if (opt && opt->optlen) {
		/* optlen is in bytes, ihl in 32-bit words. */
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);	/* fill in the option area */
	}
	ip_send_check(iph);	/* compute the header checksum */

	/* Filter on output; per the original note the hook may change the route. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       output_maybe_reroute);
}
int ip_build_xmit(struct sock *sk,
int getfrag (const void *,
char *,
unsigned int,
unsigned int), 取数据片段的函数指针
const void *frag, 以上函数的调用参数
unsigned length,
struct ipcm_cookie *ipc, IP包配置信息
struct rtable *rt,
int flags) 从用户数据建立IP包
{
int err;
struct sk_buff *skb;
int df;
struct iphdr *iph;
if (!sk->protinfo.af_inet.hdrincl) { 如果IP包头不由用户创建
length += sizeof(struct iphdr); 取IP包总长
if (length > rt->u.dst.pmtu || ipc->opt != NULL) 如果包长度大于目的入口的最大片断长
return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
} else {
if (length > rt->u.dst.dev->mtu) { 如果包长大于目的入口设备的最大片段长
ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
return -EMSGSIZE;
}
}
if (flags&MSG_PROBE) 测试操作
goto out;
df = 0;
if (ip_dont_fragment(sk, &rt->u.dst)) 如果禁止分片
df = htons(IP_DF);
{
int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
skb = sock_alloc_send_skb(sk, length+hh_len+15,
0, flags&MSG_DONTWAIT, &err); 为套接字分配发送包
if(skb==NULL)
goto error;
skb_reserve(skb, hh_len); 保留硬件帧头空间
}
skb->priority = sk->priority; 取套接字的优先级
skb->dst = dst_clone(&rt->u.dst); 取路由的目的入口
skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
if(!sk->protinfo.af_inet.hdrincl) {
iph->version=4;
iph->ihl=5;
iph->tos=sk->protinfo.af_inet.tos;
iph->tot_len = htons(length);
iph->frag_off = df;
iph->ttl=sk->protinfo.af_inet.mc_ttl;
ip_select_ident(iph, &rt->u.dst);
if (rt->rt_type != RTN_MULTICAST)
iph->ttl=sk->protinfo.af_inet.ttl;
iph->protocol=sk->protocol;
iph->saddr=rt->rt_src;
iph->daddr=rt->rt_dst;
iph->check=0;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
; 读取用户一片数据
}
else 如果IP包头由用户创建, 直接将用户数据读入IP头所在位置
err = getfrag(frag, (void *)iph, 0, length);
if (err)
goto error_fault;
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
output_maybe_reroute);
if (err > 0)
err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
if (err)
goto error;
out:
return 0;
error_fault:
err = -EFAULT;
kfree_skb(skb);
error:
IP_INC_STATS(IpOutDiscards);
return err;
}
static int ip_build_xmit_slow(struct sock *sk,
int getfrag (const void *,
char *,
unsigned int,
unsigned int),
const void *frag,
unsigned length,
struct ipcm_cookie *ipc,
struct rtable *rt,
int flags) 建立IP选项区或者分片输出
{
unsigned int fraglen, maxfraglen, fragheaderlen;
int err;
int offset, mf;
int mtu;
u16 id = 0;
int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
int nfrags=0;
struct ip_options *opt = ipc->opt;
int df = 0;
mtu = rt->u.dst.pmtu;
if (ip_dont_fragment(sk, &rt->u.dst))
df = htons(IP_DF);
length -= sizeof(struct iphdr);
if (opt) {
fragheaderlen = sizeof(struct iphdr) + opt->optlen;
maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
} else {
fragheaderlen = sizeof(struct iphdr);
maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
} 求最大IP包长
if (length + fragheaderlen > 0xFFFF) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
return -EMSGSIZE;
}
offset = length - (length % (maxfraglen - fragheaderlen));取最后一个片段的数据偏移量
fraglen = length - offset + fragheaderlen; 求取后一个片段IP包全长
if (length-offset==0) { 如果用户数据恰好是最大单片数据长度的整数倍
fraglen = maxfraglen;
offset -= maxfraglen-fragheaderlen;
}
mf = 0;
if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
return -EMSGSIZE;
}
if (flags&MSG_PROBE)
goto out;
do {
char *data;
struct sk_buff * skb;
skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
if (skb == NULL)
goto error;
skb->priority = sk->priority;
skb->dst = dst_clone(&rt->u.dst);
skb_reserve(skb, hh_len);
data = skb_put(skb, fraglen);
skb->nh.iph = (struct iphdr *)data;
{
struct iphdr *iph = (struct iphdr *)data;
iph->version = 4;
iph->ihl = 5;
if (opt) {
iph->ihl += opt->optlen>>2;
ip_options_build(skb, opt,
ipc->addr, rt, offset);
}
iph->tos = sk->protinfo.af_inet.tos;
iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
iph->frag_off = htons(offset>>3)|mf|df;
iph->id = id;
if (!mf) {
if (offset || !df) {
__ip_select_ident(iph, &rt->u.dst);
id = iph->id;
}
mf = htons(IP_MF);
}
if (rt->rt_type == RTN_MULTICAST)
iph->ttl = sk->protinfo.af_inet.mc_ttl;
else
iph->ttl = sk->protinfo.af_inet.ttl;
iph->protocol = sk->protocol;
iph->check = 0;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
data += iph->ihl*4;
}
if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
err = -EFAULT;
kfree_skb(skb);
goto error;
}
offset -= (maxfraglen-fragheaderlen); 片段从后向前进行分割, 是为了方便TCP包的校验
fraglen = maxfraglen;
nfrags++;
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
skb->dst->dev, output_maybe_reroute);
if (err) {
if (err > 0)
err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
if (err)
goto error;
}
} while (offset >= 0);
if (nfrags>1)
ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
out:
return 0;
error:
IP_INC_STATS(IpOutDiscards);
if (nfrags>1)
ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
return err;
}
/*
 * Send a reply to the packet skb through socket sk, using ip_build_xmit()
 * with ip_reply_glue_bits() as the data callback.
 *
 * sk:  socket used for sending; its tos, priority and protocol fields are
 *      overwritten from skb while the socket is locked with bh_lock_sock()
 * skb: the packet being answered; supplies the route, tos, priority,
 *      protocol and the IP options to echo
 * arg: reply data description handed to ip_reply_glue_bits()
 * len: length of the reply data
 *
 * Returns nothing; the function silently gives up when echoing the options
 * or looking up the output route fails, and the result of ip_build_xmit()
 * is not examined.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct {
		struct ip_options opt;
		char data[40];	/* storage for the IP option block */
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	/* Echo the IP options of skb into replyopts. */
	if (ip_options_echo(&replyopts.opt, skb))
		return;

	/* Reply to the source address recorded in the incoming packet's route. */
	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		/* With source routing, route towards the option's faddr instead. */
		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	/* rt is replaced by the output route found here. */
	if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
		return;

	bh_lock_sock(sk);
	sk->protinfo.af_inet.tos = skb->nh.iph->tos;
	sk->priority = skb->priority;
	sk->protocol = skb->nh.iph->protocol;
	ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
/*
 * Description of the reply data consumed by ip_reply_glue_bits():
 * up to two source buffers plus running checksum state.
 */
struct ip_reply_arg {
struct iovec iov[2];	/* source buffers; treated as one contiguous byte range */
int n_iov;		/* not referenced by the code visible here */
u32 csum;		/* running checksum, updated on every copy */
int csumoffset;		/* index in 16-bit words from the start of the copied
			 * data where the folded checksum is stored; 0 means
			 * no checksum is written */
};
/*
 * getfrag-style callback: copy fraglen bytes, starting at byte 'offset' of
 * the data described by the struct ip_reply_arg behind dptr, into 'to',
 * folding the copied bytes into the running checksum arg->csum.
 *
 * The two iovec entries are treated as one contiguous range; a copy may
 * start in the first and continue into the second. When the copy starts
 * inside the first entry and csumoffset is nonzero, the folded checksum is
 * stored csumoffset 16-bit words into the destination.
 *
 * Always returns 0.
 */
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
			      unsigned int fraglen)
{
	struct ip_reply_arg *arg = (struct ip_reply_arg*)dptr;
	u16 *dest_words = (u16 *)to;	/* start of destination, kept for the checksum store */
	struct iovec *vec = &arg->iov[0];
	int starts_in_first = 1;
	int avail;

	/* Requested offset lies beyond the first buffer: begin in the second. */
	if (offset >= vec->iov_len) {
		offset -= vec->iov_len;
		vec++;
		starts_in_first = 0;
	}

	/* Bytes left in the current buffer from offset onwards. */
	avail = vec->iov_len - offset;

	if (fraglen > avail) {
		/* The request spans two buffers: drain the current one first. */
		arg->csum = csum_partial_copy_nocheck(vec->iov_base+offset, to, avail,
						      arg->csum);
		offset = 0;
		fraglen -= avail;
		to += avail;
		vec++;
	}

	arg->csum = csum_partial_copy_nocheck(vec->iov_base+offset, to, fraglen,
					      arg->csum);

	/* Write the folded checksum into the copied data. */
	if (starts_in_first && arg->csumoffset)
		*(dest_words + arg->csumoffset) = csum_fold(arg->csum);

	return 0;
}
int ip_queue_xmit(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct ip_options *opt = sk->protinfo.af_inet.opt;
struct rtable *rt;
struct iphdr *iph;
rt = (struct rtable *)__sk_dst_check(sk, 0); 取套接字所缓冲的发送包的目的路由入口
if (rt == NULL) { 如果尚未缓冲
u32 daddr;
daddr = sk->daddr; 取套接字的对端地址作为目的地址
if(opt && opt->srr) 如果具有信源路由选项
daddr = opt->faddr; 取信源路由的转发地址作为目的地址
if (ip_route_output(&rt, daddr, sk->saddr,
RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
sk->bound_dev_if)) 查询目的地址的路由目的入口
goto no_route;
__sk_dst_set(sk, &rt->u.dst); 将该路由入口缓冲到套接字上
}
skb->dst = dst_clone(&rt->u.dst); 将路由入口绑定到发送包
if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto no_route; 如果是指定严格信源路由并且其转发地址不等于网关地址,则操作失败
iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen :0));
*((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));
iph->tot_len = htons(skb->len);
iph->frag_off = 0;
iph->ttl = sk->protinfo.af_inet.ttl;
iph->protocol = sk->protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
skb->nh.iph = iph;
if(opt && opt->optlen) { 建立IP选项区
iph->ihl += opt->optlen >> 2;
ip_options_build(skb, opt, sk->daddr, rt, 0);
}
return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
ip_queue_xmit2); 过滤输出
no_route:
IP_INC_STATS(IpOutNoRoutes);
kfree_skb(skb);
return -EHOSTUNREACH;
}