/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>

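/* Final transmit step: resolve the IPv6 next hop to a neighbour entry and
 * hand the packet to the device. Locally destined multicast copies are
 * looped back through the netfilter POST_ROUTING hook first.
 */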
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

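/* Fragment before the final transmit step if the packet exceeds the path
 * MTU (and is not GSO), if the route requires fragmentation on all packets
 * (dst_allfrag), or if conntrack defragmentation recorded a smaller
 * incoming fragment size.
 */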
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

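/* Common exit point for locally generated and forwarded packets: drop the
 * packet if IPv6 is administratively disabled on the egress device, then
 * run the netfilter POST_ROUTING hook unless the packet was already
 * rerouted through it (IP6SKB_REROUTED).
 */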
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * so it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     np->autoflowlabel, fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		/* Hooks should never assume the socket lock is held,
		 * so we promote our socket to non-const.
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require the socket lock,
	 * so we promote our socket to non-const.
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

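/* Deliver a packet carrying a Router Alert option to every raw socket that
 * registered for this alert value via the IPV6_ROUTER_ALERT socket option.
 * Returns 1 if the packet was consumed by at least one listener.
 */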
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

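/* Decide what to do with a packet that matched a proxy neighbour entry:
 * 1 = hand it to the local input path (NDP messages the proxy must answer),
 * 0 = forward it normally, -1 = drop it (link-local destination).
 */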
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address must be handled by the
			 * input path, so pass them up.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	skb_sender_cpu_clear(skb);
	return dst_output(net, sk, skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

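/* A packet is "too big" to forward when it exceeds the egress MTU and we
 * may not fragment it: ignore_df packets and GSO packets whose segments
 * fit are exempt, while a larger conntrack-recorded fragment size forces
 * the ICMP Packet Too Big path.
 */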
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}

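/* Forward one IPv6 packet: validate it, decrement the hop limit, send
 * redirects and Packet Too Big errors where appropriate, and pass it to
 * the netfilter FORWARD hook towards the egress device.
 */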
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP there is nothing we can do.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}
	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source-routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

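/* Fragment an IPv6 packet per RFC 2460: a fast path reuses an existing
 * frag_list as ready-made fragments when the geometry allows it, otherwise
 * a slow path copies the payload into freshly allocated fragment skbs.
 * Each fragment is handed to @output.
 */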
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one has been sent. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	skb->dev = skb_dst(skb)->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

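/* Validate a socket's cached dst against the flow about to be sent:
 * release it and return NULL when the cached route no longer matches the
 * destination (or source, with subtrees) or the requested interface.
 */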
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the non-connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * or MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now,
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so this last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

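/* Common tail of the dst lookup helpers below: resolve the route for @fl6,
 * choosing a source address when none was given. With optimistic DAD, if
 * the chosen source address is optimistic and the next hop is not yet
 * known to be reachable, redo the lookup towards the default router.
 */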
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (!fl6->flowi6_oif)
		fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

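/* UDP fragmentation offload path: instead of building fragments in
 * software, create (or extend) one oversized GSO skb for the whole
 * datagram and let the device segment it; gso_size is the 8-byte-aligned
 * payload size of each fragment-to-be.
 */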
static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int exthdrlen, int transhdrlen, int mtu,
			unsigned int flags, const struct flowi6 *fl6)

{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP large send offload, so create
	 * one single skb containing the complete UDP datagram.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_set_network_header(skb, exthdrlen);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
							 &fl6->daddr,
							 &fl6->saddr);

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

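/* Set up the cork for a corked send: duplicate the tx options (the
 * caller's copy may go away while the cork is held), take a reference on
 * the route, and record the hop limit, traffic class and fragment size
 * that every subsequently appended chunk will use.
 */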
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = opt->tot_len;
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

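/* Workhorse behind ip6_append_data() and ip6_make_skb(): append @length
 * bytes from @from to the queue of pending skbs, growing the tail skb
 * (linearly or via page frags) while the data fits, and allocating new
 * skbs sized to the fragment limit when it does not. Bytes needed for
 * 8-byte fragment alignment are moved from the previous skb.
 */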
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	if (cork->length + length > mtu - headersize && dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		ipv6_local_error(sk, EMSGSIZE, fl6,
				 mtu - headersize +
				 sizeof(struct ipv6hdr));
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length < mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM)
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, fl6);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

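/**
 *	ip6_append_data - append data to the pending queue of a corked socket
 *	@sk: socket sending the data
 *	@getfrag: callback that copies user data into the skb
 *	@from: opaque argument passed to @getfrag
 *	@length: number of bytes to append
 *	@transhdrlen: transport header length (non-zero for the first chunk only)
 *	@hlimit: hop limit to use
 *	@tclass: traffic class to use
 *	@opt: IPv6 tx options, or NULL
 *	@fl6: flow describing the destination
 *	@rt: route the packets will take
 *	@flags: MSG_* flags (MSG_MORE, MSG_DONTWAIT, MSG_PROBE)
 *	@dontfrag: when set, notify the caller of the path MTU instead of
 *		   fragmenting oversized datagrams
 *
 *	The first call on an empty write queue sets up the cork; later calls
 *	append to it. The queue is sent by ip6_push_pending_frames() or
 *	discarded by ip6_flush_pending_frames().
 */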
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen, int hlimit,
		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
				     tclass, opt, rt, fl6);
		if (err)
			return err;

		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, dontfrag);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

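/* Collapse the queue of pending skbs into one skb (chained via frag_list),
 * push the extension headers and the IPv6 header, and release the cork.
 * The caller sends the result with ip6_send_skb().
 */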
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel, fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

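/* Build a complete datagram in one shot on a private queue and cork,
 * without corking the socket: cork setup, data append and skb assembly
 * combined. Used for sends that do not span multiple calls.
 */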
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err)
		return ERR_PTR(err);

	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}