/*
 * xfrm_policy.c
 *
 * Changes:
 *	Mitsuru KANDA @USAGI
 * 	Kazunori MIYAZAWA @USAGI
 * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 * 		IPv6 support
 * 	Kazunori MIYAZAWA @USAGI
 * 	YOSHIFUJI Hideaki
 * 		Split up af-specific portion
 *	Derek Atkins <derek@ihtfp.com>		Add the post_input processor
 *
 */

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/audit.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/xfrm.h>
#include <net/ip.h>
#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif

#include "xfrm_hash.h"

#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10))
#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
#define XFRM_MAX_QUEUE_LEN	100

struct xfrm_flo {
	struct dst_entry *dst_orig;
	u8 flags;
};

static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
						__read_mostly;

static struct kmem_cache *xfrm_dst_cache __read_mostly;

static void xfrm_init_pmtu(struct dst_entry *dst);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
static void xfrm_policy_queue_process(unsigned long arg);

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
						int dir);

static inline bool
__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
	const struct flowi4 *fl4 = &fl->u.ip4;

	return  addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) &&
		addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) &&
		!((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
		!((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
		(fl4->flowi4_proto == sel->proto || !sel->proto) &&
		(fl4->flowi4_oif == sel->ifindex || !sel->ifindex);
}

static inline bool
__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
	const struct flowi6 *fl6 = &fl->u.ip6;

	return  addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) &&
		addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) &&
		!((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) &&
		!((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) &&
		(fl6->flowi6_proto == sel->proto || !sel->proto) &&
		(fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
}

bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
			 unsigned short family)
{
	switch (family) {
	case AF_INET:
		return __xfrm4_selector_match(sel, fl);
	case AF_INET6:
		return __xfrm6_selector_match(sel, fl);
	}
	return false;
}

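/* Look up the address-family specific policy ops for @family. On success
 * the RCU read lock is held and must be dropped via xfrm_policy_put_afinfo();
 * on failure NULL is returned with the lock already released.
 */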
static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo;

	if (unlikely(family >= NPROTO))
		return NULL;
	rcu_read_lock();
	afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
	if (unlikely(!afinfo))
		rcu_read_unlock();
	return afinfo;
}

static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	rcu_read_unlock();
}

static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
						  const xfrm_address_t *saddr,
						  const xfrm_address_t *daddr,
						  int family)
{
	struct xfrm_policy_afinfo *afinfo;
	struct dst_entry *dst;

	afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return ERR_PTR(-EAFNOSUPPORT);

	dst = afinfo->dst_lookup(net, tos, saddr, daddr);

	xfrm_policy_put_afinfo(afinfo);

	return dst;
}

static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
						xfrm_address_t *prev_saddr,
						xfrm_address_t *prev_daddr,
						int family)
{
	struct net *net = xs_net(x);
	xfrm_address_t *saddr = &x->props.saddr;
	xfrm_address_t *daddr = &x->id.daddr;
	struct dst_entry *dst;

	if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
		saddr = x->coaddr;
		daddr = prev_daddr;
	}
	if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
		saddr = prev_saddr;
		daddr = x->coaddr;
	}

	dst = __xfrm_dst_lookup(net, tos, saddr, daddr, family);

	if (!IS_ERR(dst)) {
		if (prev_saddr != saddr)
			memcpy(prev_saddr, saddr,  sizeof(*prev_saddr));
		if (prev_daddr != daddr)
			memcpy(prev_daddr, daddr,  sizeof(*prev_daddr));
	}

	return dst;
}

static inline unsigned long make_jiffies(long secs)
{
	if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
		return MAX_SCHEDULE_TIMEOUT-1;
	else
		return secs*HZ;
}

static void xfrm_policy_timer(unsigned long data)
{
	struct xfrm_policy *xp = (struct xfrm_policy *)data;
	unsigned long now = get_seconds();
	long next = LONG_MAX;
	int warn = 0;
	int dir;

	read_lock(&xp->lock);

	if (unlikely(xp->walk.dead))
		goto out;

	dir = xfrm_policy_id2dir(xp->index);

	if (xp->lft.hard_add_expires_seconds) {
		long tmo = xp->lft.hard_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.hard_use_expires_seconds) {
		long tmo = xp->lft.hard_use_expires_seconds +
			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_add_expires_seconds) {
		long tmo = xp->lft.soft_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_use_expires_seconds) {
		long tmo = xp->lft.soft_use_expires_seconds +
			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}

	if (warn)
		km_policy_expired(xp, dir, 0, 0);
	if (next != LONG_MAX &&
	    !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
		xfrm_pol_hold(xp);

out:
	read_unlock(&xp->lock);
	xfrm_pol_put(xp);
	return;

expired:
	read_unlock(&xp->lock);
	if (!xfrm_policy_delete(xp, dir))
		km_policy_expired(xp, dir, 1, 0);
	xfrm_pol_put(xp);
}

static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
{
	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);

	if (unlikely(pol->walk.dead))
		flo = NULL;
	else
		xfrm_pol_hold(pol);

	return flo;
}

static int xfrm_policy_flo_check(struct flow_cache_object *flo)
{
	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);

	return !pol->walk.dead;
}

static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
{
	xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
}

static const struct flow_cache_ops xfrm_policy_fc_ops = {
	.get = xfrm_policy_flo_get,
	.check = xfrm_policy_flo_check,
	.delete = xfrm_policy_flo_delete,
};

/* Allocate xfrm_policy. Not used here; it is supposed to be used by pfkeyv2
 * SPD calls.
 */

struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
{
	struct xfrm_policy *policy;

	policy = kzalloc(sizeof(struct xfrm_policy), gfp);

	if (policy) {
		write_pnet(&policy->xp_net, net);
		INIT_LIST_HEAD(&policy->walk.all);
		INIT_HLIST_NODE(&policy->bydst);
		INIT_HLIST_NODE(&policy->byidx);
		rwlock_init(&policy->lock);
		atomic_set(&policy->refcnt, 1);
		skb_queue_head_init(&policy->polq.hold_queue);
		setup_timer(&policy->timer, xfrm_policy_timer,
				(unsigned long)policy);
		setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process,
			    (unsigned long)policy);
		policy->flo.ops = &xfrm_policy_fc_ops;
	}
	return policy;
}
EXPORT_SYMBOL(xfrm_policy_alloc);

/* Destroy xfrm_policy: descendant resources must have been released by this point. */

void xfrm_policy_destroy(struct xfrm_policy *policy)
{
	BUG_ON(!policy->walk.dead);

	if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
		BUG();

	security_xfrm_policy_free(policy->security);
	kfree(policy);
}
EXPORT_SYMBOL(xfrm_policy_destroy);

static void xfrm_queue_purge(struct sk_buff_head *list)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(list)) != NULL)
		kfree_skb(skb);
}

/* Rule must be locked. Release descendant resources, announce
 * the entry dead. The rule must already be unlinked from all lists.
 */

static void xfrm_policy_kill(struct xfrm_policy *policy)
{
	policy->walk.dead = 1;

	atomic_inc(&policy->genid);

	if (del_timer(&policy->polq.hold_timer))
		xfrm_pol_put(policy);
	xfrm_queue_purge(&policy->polq.hold_queue);

	if (del_timer(&policy->timer))
		xfrm_pol_put(policy);

	xfrm_pol_put(policy);
}

static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;

static inline unsigned int idx_hash(struct net *net, u32 index)
{
	return __idx_hash(index, net->xfrm.policy_idx_hmask);
}

/* calculate policy hash thresholds */
static void __get_hash_thresh(struct net *net,
			      unsigned short family, int dir,
			      u8 *dbits, u8 *sbits)
{
	switch (family) {
	case AF_INET:
		*dbits = net->xfrm.policy_bydst[dir].dbits4;
		*sbits = net->xfrm.policy_bydst[dir].sbits4;
		break;

	case AF_INET6:
		*dbits = net->xfrm.policy_bydst[dir].dbits6;
		*sbits = net->xfrm.policy_bydst[dir].sbits6;
		break;

	default:
		*dbits = 0;
		*sbits = 0;
	}
}

static struct hlist_head *policy_hash_bysel(struct net *net,
					    const struct xfrm_selector *sel,
					    unsigned short family, int dir)
{
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
	unsigned int hash;
	u8 dbits;
	u8 sbits;

	__get_hash_thresh(net, family, dir, &dbits, &sbits);
	hash = __sel_hash(sel, family, hmask, dbits, sbits);

	return (hash == hmask + 1 ?
		&net->xfrm.policy_inexact[dir] :
		net->xfrm.policy_bydst[dir].table + hash);
}

static struct hlist_head *policy_hash_direct(struct net *net,
					     const xfrm_address_t *daddr,
					     const xfrm_address_t *saddr,
					     unsigned short family, int dir)
{
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
	unsigned int hash;
	u8 dbits;
	u8 sbits;

	__get_hash_thresh(net, family, dir, &dbits, &sbits);
	hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);

	return net->xfrm.policy_bydst[dir].table + hash;
}

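/* Rehash every policy on @list into @ndsttable. Entries that land in the
 * same new bucket keep their original relative order: the first one moved
 * goes to the bucket head and later ones are inserted behind it.
 */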
static void xfrm_dst_hash_transfer(struct net *net,
				   struct hlist_head *list,
				   struct hlist_head *ndsttable,
				   unsigned int nhashmask,
				   int dir)
{
	struct hlist_node *tmp, *entry0 = NULL;
	struct xfrm_policy *pol;
	unsigned int h0 = 0;
	u8 dbits;
	u8 sbits;

redo:
	hlist_for_each_entry_safe(pol, tmp, list, bydst) {
		unsigned int h;

		__get_hash_thresh(net, pol->family, dir, &dbits, &sbits);
		h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
				pol->family, nhashmask, dbits, sbits);
		if (!entry0) {
			hlist_del(&pol->bydst);
			hlist_add_head(&pol->bydst, ndsttable+h);
			h0 = h;
		} else {
			if (h != h0)
				continue;
			hlist_del(&pol->bydst);
			hlist_add_behind(&pol->bydst, entry0);
		}
		entry0 = &pol->bydst;
	}
	if (!hlist_empty(list)) {
		entry0 = NULL;
		goto redo;
	}
}

static void xfrm_idx_hash_transfer(struct hlist_head *list,
				   struct hlist_head *nidxtable,
				   unsigned int nhashmask)
{
	struct hlist_node *tmp;
	struct xfrm_policy *pol;

	hlist_for_each_entry_safe(pol, tmp, list, byidx) {
		unsigned int h;

		h = __idx_hash(pol->index, nhashmask);
		hlist_add_head(&pol->byidx, nidxtable+h);
	}
}

static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
{
	return ((old_hmask + 1) << 1) - 1;
}

static void xfrm_bydst_resize(struct net *net, int dir)
{
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
	struct hlist_head *odst = net->xfrm.policy_bydst[dir].table;
	struct hlist_head *ndst = xfrm_hash_alloc(nsize);
	int i;

	if (!ndst)
		return;

	write_lock_bh(&net->xfrm.xfrm_policy_lock);

	for (i = hmask; i >= 0; i--)
		xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);

	net->xfrm.policy_bydst[dir].table = ndst;
	net->xfrm.policy_bydst[dir].hmask = nhashmask;

	write_unlock_bh(&net->xfrm.xfrm_policy_lock);

	xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}

static void xfrm_byidx_resize(struct net *net, int total)
{
	unsigned int hmask = net->xfrm.policy_idx_hmask;
	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
	struct hlist_head *oidx = net->xfrm.policy_byidx;
	struct hlist_head *nidx = xfrm_hash_alloc(nsize);
	int i;

	if (!nidx)
		return;

	write_lock_bh(&net->xfrm.xfrm_policy_lock);

	for (i = hmask; i >= 0; i--)
		xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);

	net->xfrm.policy_byidx = nidx;
	net->xfrm.policy_idx_hmask = nhashmask;

	write_unlock_bh(&net->xfrm.xfrm_policy_lock);

	xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}

static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
{
	unsigned int cnt = net->xfrm.policy_count[dir];
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;

	if (total)
		*total += cnt;

	if ((hmask + 1) < xfrm_policy_hashmax &&
	    cnt > hmask)
		return 1;

	return 0;
}

static inline int xfrm_byidx_should_resize(struct net *net, int total)
{
	unsigned int hmask = net->xfrm.policy_idx_hmask;

	if ((hmask + 1) < xfrm_policy_hashmax &&
	    total > hmask)
		return 1;

	return 0;
}

void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
	read_lock_bh(&net->xfrm.xfrm_policy_lock);
	si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
	si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
	si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
	si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];
	si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];
	si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
	si->spdhcnt = net->xfrm.policy_idx_hmask;
	si->spdhmcnt = xfrm_policy_hashmax;
	read_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_spd_getinfo);

static DEFINE_MUTEX(hash_resize_mutex);
static void xfrm_hash_resize(struct work_struct *work)
{
	struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
	int dir, total;

	mutex_lock(&hash_resize_mutex);

	total = 0;
	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		if (xfrm_bydst_should_resize(net, dir, &total))
			xfrm_bydst_resize(net, dir);
	}
	if (xfrm_byidx_should_resize(net, total))
		xfrm_byidx_resize(net, total);

	mutex_unlock(&hash_resize_mutex);
}

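/* Rebuild the bydst hashes after a change of the selector prefixlen
 * thresholds, re-inserting every policy in creation order so that the
 * per-chain priority ordering is preserved.
 */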
static void xfrm_hash_rebuild(struct work_struct *work)
{
	struct net *net = container_of(work, struct net,
				       xfrm.policy_hthresh.work);
	unsigned int hmask;
	struct xfrm_policy *pol;
	struct xfrm_policy *policy;
	struct hlist_head *chain;
	struct hlist_head *odst;
	struct hlist_node *newpos;
	int i;
	int dir;
	unsigned seq;
	u8 lbits4, rbits4, lbits6, rbits6;

	mutex_lock(&hash_resize_mutex);

	/* read selector prefixlen thresholds */
	do {
		seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);

		lbits4 = net->xfrm.policy_hthresh.lbits4;
		rbits4 = net->xfrm.policy_hthresh.rbits4;
		lbits6 = net->xfrm.policy_hthresh.lbits6;
		rbits6 = net->xfrm.policy_hthresh.rbits6;
	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));

	write_lock_bh(&net->xfrm.xfrm_policy_lock);

	/* reset the bydst and inexact table in all directions */
	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
		hmask = net->xfrm.policy_bydst[dir].hmask;
		odst = net->xfrm.policy_bydst[dir].table;
		for (i = hmask; i >= 0; i--)
			INIT_HLIST_HEAD(odst + i);
		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
			/* dir out => dst = remote, src = local */
			net->xfrm.policy_bydst[dir].dbits4 = rbits4;
			net->xfrm.policy_bydst[dir].sbits4 = lbits4;
			net->xfrm.policy_bydst[dir].dbits6 = rbits6;
			net->xfrm.policy_bydst[dir].sbits6 = lbits6;
		} else {
			/* dir in/fwd => dst = local, src = remote */
			net->xfrm.policy_bydst[dir].dbits4 = lbits4;
			net->xfrm.policy_bydst[dir].sbits4 = rbits4;
			net->xfrm.policy_bydst[dir].dbits6 = lbits6;
			net->xfrm.policy_bydst[dir].sbits6 = rbits6;
		}
	}

	/* re-insert all policies by order of creation */
	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
		newpos = NULL;
		chain = policy_hash_bysel(net, &policy->selector,
					  policy->family,
					  xfrm_policy_id2dir(policy->index));
		hlist_for_each_entry(pol, chain, bydst) {
			if (policy->priority >= pol->priority)
				newpos = &pol->bydst;
			else
				break;
		}
		if (newpos)
			hlist_add_behind(&policy->bydst, newpos);
		else
			hlist_add_head(&policy->bydst, chain);
	}

	write_unlock_bh(&net->xfrm.xfrm_policy_lock);

	mutex_unlock(&hash_resize_mutex);
}

void xfrm_policy_hash_rebuild(struct net *net)
{
	schedule_work(&net->xfrm.policy_hthresh.work);
}
EXPORT_SYMBOL(xfrm_policy_hash_rebuild);

/* Generate new index... KAME seems to generate them ordered by cost
 * of an absolute unpredictability of ordering of rules. This will not pass. */
static u32 xfrm_gen_index(struct net *net, int dir, u32 index)
{
	static u32 idx_generator;

	for (;;) {
		struct hlist_head *list;
		struct xfrm_policy *p;
		u32 idx;
		int found;

		if (!index) {
			idx = (idx_generator | dir);
			idx_generator += 8;
		} else {
			idx = index;
			index = 0;
		}

		if (idx == 0)
			idx = 8;
		list = net->xfrm.policy_byidx + idx_hash(net, idx);
		found = 0;
		hlist_for_each_entry(p, list, byidx) {
			if (p->index == idx) {
				found = 1;
				break;
			}
		}
		if (!found)
			return idx;
	}
}

static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
{
	u32 *p1 = (u32 *) s1;
	u32 *p2 = (u32 *) s2;
	int len = sizeof(struct xfrm_selector) / sizeof(u32);
	int i;

	for (i = 0; i < len; i++) {
		if (p1[i] != p2[i])
			return 1;
	}

	return 0;
}

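/* Move any packets parked on @old's hold queue over to @new and fire
 * @new's hold timer immediately so that they are re-resolved against
 * the new policy.
 */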
static void xfrm_policy_requeue(struct xfrm_policy *old,
				struct xfrm_policy *new)
{
	struct xfrm_policy_queue *pq = &old->polq;
	struct sk_buff_head list;

	__skb_queue_head_init(&list);

	spin_lock_bh(&pq->hold_queue.lock);
	skb_queue_splice_init(&pq->hold_queue, &list);
	if (del_timer(&pq->hold_timer))
		xfrm_pol_put(old);
	spin_unlock_bh(&pq->hold_queue.lock);

	if (skb_queue_empty(&list))
		return;

	pq = &new->polq;

	spin_lock_bh(&pq->hold_queue.lock);
	skb_queue_splice(&list, &pq->hold_queue);
	pq->timeout = XFRM_QUEUE_TMO_MIN;
	if (!mod_timer(&pq->hold_timer, jiffies))
		xfrm_pol_hold(new);
	spin_unlock_bh(&pq->hold_queue.lock);
}

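/* Two policies count as the same for insertion purposes if their mark
 * value/mask pairs are identical, or if @policy's mark matches @pol's
 * value under @pol's mask and both carry the same priority.
 */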
static bool xfrm_policy_mark_match(struct xfrm_policy *policy,
				   struct xfrm_policy *pol)
{
	u32 mark = policy->mark.v & policy->mark.m;

	if (policy->mark.v == pol->mark.v && policy->mark.m == pol->mark.m)
		return true;

	if ((mark & pol->mark.m) == pol->mark.v &&
	    policy->priority == pol->priority)
		return true;

	return false;
}

int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
	struct net *net = xp_net(policy);
	struct xfrm_policy *pol;
	struct xfrm_policy *delpol;
	struct hlist_head *chain;
	struct hlist_node *newpos;

	write_lock_bh(&net->xfrm.xfrm_policy_lock);
	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
	delpol = NULL;
	newpos = NULL;
	hlist_for_each_entry(pol, chain, bydst) {
		if (pol->type == policy->type &&
		    !selector_cmp(&pol->selector, &policy->selector) &&
		    xfrm_policy_mark_match(policy, pol) &&
		    xfrm_sec_ctx_match(pol->security, policy->security) &&
		    !WARN_ON(delpol)) {
			if (excl) {
				write_unlock_bh(&net->xfrm.xfrm_policy_lock);
				return -EEXIST;
			}
			delpol = pol;
			if (policy->priority > pol->priority)
				continue;
		} else if (policy->priority >= pol->priority) {
			newpos = &pol->bydst;
			continue;
		}
		if (delpol)
			break;
	}
	if (newpos)
		hlist_add_behind(&policy->bydst, newpos);
	else
		hlist_add_head(&policy->bydst, chain);
	__xfrm_policy_link(policy, dir);
	atomic_inc(&net->xfrm.flow_cache_genid);

	/* After previous checking, family can either be AF_INET or AF_INET6 */
	if (policy->family == AF_INET)
		rt_genid_bump_ipv4(net);
	else
		rt_genid_bump_ipv6(net);

	if (delpol) {
		xfrm_policy_requeue(delpol, policy);
		__xfrm_policy_unlink(delpol, dir);
	}
	policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
	hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
	policy->curlft.add_time = get_seconds();
	policy->curlft.use_time = 0;
	if (!mod_timer(&policy->timer, jiffies + HZ))
		xfrm_pol_hold(policy);
	write_unlock_bh(&net->xfrm.xfrm_policy_lock);

	if (delpol)
		xfrm_policy_kill(delpol);
	else if (xfrm_bydst_should_resize(net, dir, NULL))
		schedule_work(&net->xfrm.policy_hash_work);

	return 0;
}
EXPORT_SYMBOL(xfrm_policy_insert);

struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
					  int dir, struct xfrm_selector *sel,
					  struct xfrm_sec_ctx *ctx, int delete,
					  int *err)
{
	struct xfrm_policy *pol, *ret;
	struct hlist_head *chain;

	*err = 0;
	write_lock_bh(&net->xfrm.xfrm_policy_lock);
	chain = policy_hash_bysel(net, sel, sel->family, dir);
	ret = NULL;
	hlist_for_each_entry(pol, chain, bydst) {
		if (pol->type == type &&
		    (mark & pol->mark.m) == pol->mark.v &&
		    !selector_cmp(sel, &pol->selector) &&
		    xfrm_sec_ctx_match(ctx, pol->security)) {
			xfrm_pol_hold(pol);
			if (delete) {
				*err = security_xfrm_policy_delete(
								pol->security);
				if (*err) {
					write_unlock_bh(&net->xfrm.xfrm_policy_lock);
					return pol;
				}
				__xfrm_policy_unlink(pol, dir);
			}
			ret = pol;
			break;
		}
	}
	write_unlock_bh(&net->xfrm.xfrm_policy_lock);

	if (ret && delete)
		xfrm_policy_kill(ret);
	return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);

struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
				     int dir, u32 id, int delete, int *err)
{
	struct xfrm_policy *pol, *ret;
	struct hlist_head *chain;

	*err = -ENOENT;
	if (xfrm_policy_id2dir(id) != dir)
		return NULL;

	*err = 0;
	write_lock_bh(&net->xfrm.xfrm_policy_lock);
	chain = net->xfrm.policy_byidx + idx_hash(net, id);
	ret = NULL;
	hlist_for_each_entry(pol, chain, byidx) {
		if (pol->type == type && pol->index == id &&
		    (mark & pol->mark.m) == pol->mark.v) {
			xfrm_pol_hold(pol);
			if (delete) {
				*err = security_xfrm_policy_delete(
								pol->security);
				if (*err) {
					write_unlock_bh(&net->xfrm.xfrm_policy_lock);
					return pol;
				}
				__xfrm_policy_unlink(pol, dir);
			}
			ret = pol;
			break;
		}
	}
	write_unlock_bh(&net->xfrm.xfrm_policy_lock);

	if (ret && delete)
		xfrm_policy_kill(ret);
	return ret;
}
EXPORT_SYMBOL(xfrm_policy_byid);

#ifdef CONFIG_SECURITY_NETWORK_XFRM
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
	int dir, err = 0;

	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		struct xfrm_policy *pol;
		int i;

		hlist_for_each_entry(pol,
				     &net->xfrm.policy_inexact[dir], bydst) {
			if (pol->type != type)
				continue;
			err = security_xfrm_policy_delete(pol->security);
			if (err) {
				xfrm_audit_policy_delete(pol, 0, task_valid);
				return err;
			}
		}
		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
			hlist_for_each_entry(pol,
					     net->xfrm.policy_bydst[dir].table + i,
					     bydst) {
				if (pol->type != type)
					continue;
				err = security_xfrm_policy_delete(
								pol->security);
				if (err) {
					xfrm_audit_policy_delete(pol, 0,
								 task_valid);
					return err;
				}
			}
		}
	}
	return err;
}
#else
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
	return 0;
}
#endif

int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
	int dir, err = 0, cnt = 0;

	write_lock_bh(&net->xfrm.xfrm_policy_lock);

	err = xfrm_policy_flush_secctx_check(net, type, task_valid);
	if (err)
		goto out;

	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		struct xfrm_policy *pol;
		int i;

	again1:
		hlist_for_each_entry(pol,
				     &net->xfrm.policy_inexact[dir], bydst) {
			if (pol->type != type)
				continue;
			__xfrm_policy_unlink(pol, dir);
			write_unlock_bh(&net->xfrm.xfrm_policy_lock);
			cnt++;

			xfrm_audit_policy_delete(pol, 1, task_valid);

			xfrm_policy_kill(pol);

			write_lock_bh(&net->xfrm.xfrm_policy_lock);
			goto again1;
		}

		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
	again2:
			hlist_for_each_entry(pol,
					     net->xfrm.policy_bydst[dir].table + i,
					     bydst) {
				if (pol->type != type)
					continue;
				__xfrm_policy_unlink(pol, dir);
				write_unlock_bh(&net->xfrm.xfrm_policy_lock);
				cnt++;

				xfrm_audit_policy_delete(pol, 1, task_valid);
				xfrm_policy_kill(pol);

				write_lock_bh(&net->xfrm.xfrm_policy_lock);
				goto again2;
			}
		}

	}
	if (!cnt)
		err = -ESRCH;
out:
	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
	return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);

int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
		     int (*func)(struct xfrm_policy *, int, int, void*),
		     void *data)
{
	struct xfrm_policy *pol;
	struct xfrm_policy_walk_entry *x;
	int error = 0;

	if (walk->type >= XFRM_POLICY_TYPE_MAX &&
	    walk->type != XFRM_POLICY_TYPE_ANY)
		return -EINVAL;

	if (list_empty(&walk->walk.all) && walk->seq != 0)
		return 0;

	write_lock_bh(&net->xfrm.xfrm_policy_lock);
	if (list_empty(&walk->walk.all))
		x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
	else
		x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all);
	list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
		if (x->dead)
			continue;
		pol = container_of(x, struct xfrm_policy, walk);
		if (walk->type != XFRM_POLICY_TYPE_ANY &&
		    walk->type != pol->type)
			continue;
		error = func(pol, xfrm_policy_id2dir(pol->index),
			     walk->seq, data);
		if (error) {
			list_move_tail(&walk->walk.all, &x->all);
			goto out;
		}
		walk->seq++;
	}
	if (walk->seq == 0) {
		error = -ENOENT;
		goto out;
	}
	list_del_init(&walk->walk.all);
out:
	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
	return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);

void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
{
	INIT_LIST_HEAD(&walk->walk.all);
	walk->walk.dead = 1;
	walk->type = type;
	walk->seq = 0;
}
EXPORT_SYMBOL(xfrm_policy_walk_init);

void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
{
	if (list_empty(&walk->walk.all))
		return;

	write_lock_bh(&net->xfrm.xfrm_policy_lock);
	list_del(&walk->walk.all);
	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_policy_walk_done);

/*
 * Find policy to apply to this flow.
 *
 * Returns 0 if a policy is found, else a negative errno.
 */
static int xfrm_policy_match(const struct xfrm_policy *pol,
			     const struct flowi *fl,
			     u8 type, u16 family, int dir)
{
	const struct xfrm_selector *sel = &pol->selector;
	int ret = -ESRCH;
	bool match;

	if (pol->family != family ||
	    (fl->flowi_mark & pol->mark.m) != pol->mark.v ||
	    pol->type != type)
		return ret;

	match = xfrm_selector_match(sel, fl, family);
	if (match)
		ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
						  dir);

	return ret;
}

static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
						     const struct flowi *fl,
						     u16 family, u8 dir)
{
	int err;
	struct xfrm_policy *pol, *ret;
	const xfrm_address_t *daddr, *saddr;
	struct hlist_head *chain;
	u32 priority = ~0U;

	daddr = xfrm_flowi_daddr(fl, family);
	saddr = xfrm_flowi_saddr(fl, family);
	if (unlikely(!daddr || !saddr))
		return NULL;

	read_lock_bh(&net->xfrm.xfrm_policy_lock);
	chain = policy_hash_direct(net, daddr, saddr, family, dir);
	ret = NULL;
	hlist_for_each_entry(pol, chain, bydst) {
		err = xfrm_policy_match(pol, fl, type, family, dir);
		if (err) {
			if (err == -ESRCH)
				continue;
			else {
				ret = ERR_PTR(err);
				goto fail;
			}
		} else {
			ret = pol;
			priority = ret->priority;
			break;
		}
	}
	chain = &net->xfrm.policy_inexact[dir];
	hlist_for_each_entry(pol, chain, bydst) {
		err = xfrm_policy_match(pol, fl, type, family, dir);
		if (err) {
			if (err == -ESRCH)
				continue;
			else {
				ret = ERR_PTR(err);
				goto fail;
			}
		} else if (pol->priority < priority) {
			ret = pol;
			break;
		}
	}
	if (ret)
		xfrm_pol_hold(ret);
fail:
	read_unlock_bh(&net->xfrm.xfrm_policy_lock);

	return ret;
}

static struct xfrm_policy *
__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
{
#ifdef CONFIG_XFRM_SUB_POLICY
	struct xfrm_policy *pol;

	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
	if (pol != NULL)
		return pol;
#endif
	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
}

static int flow_to_policy_dir(int dir)
{
	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
		return dir;

	switch (dir) {
	default:
	case FLOW_DIR_IN:
		return XFRM_POLICY_IN;
	case FLOW_DIR_OUT:
		return XFRM_POLICY_OUT;
	case FLOW_DIR_FWD:
		return XFRM_POLICY_FWD;
	}
}

static struct flow_cache_object *
xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
		   u8 dir, struct flow_cache_object *old_obj, void *ctx)
{
	struct xfrm_policy *pol;

	if (old_obj)
		xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));

	pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir));
	if (IS_ERR_OR_NULL(pol))
		return ERR_CAST(pol);

	/* Resolver returns two references:
	 * one for cache and one for caller of flow_cache_lookup() */
	xfrm_pol_hold(pol);

	return &pol->flo;
}

static inline int policy_to_flow_dir(int dir)
{
	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
		return dir;
	switch (dir) {
	default:
	case XFRM_POLICY_IN:
		return FLOW_DIR_IN;
	case XFRM_POLICY_OUT:
		return FLOW_DIR_OUT;
	case XFRM_POLICY_FWD:
		return FLOW_DIR_FWD;
	}
}

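/* Look up the per-socket policy for @dir and match it against the flow.
 * Returns a held policy on match, NULL when there is no policy or the
 * mark/security checks reject it, or an ERR_PTR() on security errors.
 */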
static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir,
						 const struct flowi *fl)
{
	struct xfrm_policy *pol;
	struct net *net = sock_net(sk);

	read_lock_bh(&net->xfrm.xfrm_policy_lock);
	if ((pol = sk->sk_policy[dir]) != NULL) {
		bool match = xfrm_selector_match(&pol->selector, fl,
						 sk->sk_family);
		int err = 0;

		if (match) {
			if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
				pol = NULL;
				goto out;
			}
			err = security_xfrm_policy_lookup(pol->security,
						      fl->flowi_secid,
						      policy_to_flow_dir(dir));
			if (!err)
				xfrm_pol_hold(pol);
			else if (err == -ESRCH)
				pol = NULL;
			else
				pol = ERR_PTR(err);
		} else
			pol = NULL;
	}
out:
	read_unlock_bh(&net->xfrm.xfrm_policy_lock);
	return pol;
}

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
	struct net *net = xp_net(pol);

	list_add(&pol->walk.all, &net->xfrm.policy_all);
	net->xfrm.policy_count[dir]++;
	xfrm_pol_hold(pol);
}

static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
						int dir)
{
	struct net *net = xp_net(pol);

	if (list_empty(&pol->walk.all))
		return NULL;

	/* Socket policies are not hashed. */
	if (!hlist_unhashed(&pol->bydst)) {
		hlist_del(&pol->bydst);
		hlist_del(&pol->byidx);
	}

	list_del_init(&pol->walk.all);
	net->xfrm.policy_count[dir]--;

	return pol;
}

static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir)
{
	__xfrm_policy_link(pol, XFRM_POLICY_MAX + dir);
}

static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir)
{
	__xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir);
}

int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
	struct net *net = xp_net(pol);

	write_lock_bh(&net->xfrm.xfrm_policy_lock);
	pol = __xfrm_policy_unlink(pol, dir);
	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
	if (pol) {
		xfrm_policy_kill(pol);
		return 0;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(xfrm_policy_delete);

int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
	struct net *net = xp_net(pol);
	struct xfrm_policy *old_pol;

#ifdef CONFIG_XFRM_SUB_POLICY
	if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
		return -EINVAL;
#endif

	write_lock_bh(&net->xfrm.xfrm_policy_lock);
	old_pol = sk->sk_policy[dir];
	sk->sk_policy[dir] = pol;
	if (pol) {
		pol->curlft.add_time = get_seconds();
		pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
		xfrm_sk_policy_link(pol, dir);
	}
	if (old_pol) {
		if (pol)
			xfrm_policy_requeue(old_pol, pol);

		/* Unlinking always succeeds. This is the only function
		 * allowed to delete or replace a socket policy.
		 */
		xfrm_sk_policy_unlink(old_pol, dir);
	}
	write_unlock_bh(&net->xfrm.xfrm_policy_lock);

	if (old_pol) {
		xfrm_policy_kill(old_pol);
	}
	return 0;
}

static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
{
	struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC);
	struct net *net = xp_net(old);

	if (newp) {
		newp->selector = old->selector;
		if (security_xfrm_policy_clone(old->security,
					       &newp->security)) {
			kfree(newp);
			return NULL;  /* ENOMEM */
		}
		newp->lft = old->lft;
		newp->curlft = old->curlft;
		newp->mark = old->mark;
		newp->action = old->action;
		newp->flags = old->flags;
		newp->xfrm_nr = old->xfrm_nr;
		newp->index = old->index;
		newp->type = old->type;
		memcpy(newp->xfrm_vec, old->xfrm_vec,
		       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
		write_lock_bh(&net->xfrm.xfrm_policy_lock);
		xfrm_sk_policy_link(newp, dir);
		write_unlock_bh(&net->xfrm.xfrm_policy_lock);
		xfrm_pol_put(newp);
	}
	return newp;
}

int __xfrm_sk_clone_policy(struct sock *sk)
{
	struct xfrm_policy *p0 = sk->sk_policy[0],
			   *p1 = sk->sk_policy[1];

	sk->sk_policy[0] = sk->sk_policy[1] = NULL;
	if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
		return -ENOMEM;
	if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
		return -ENOMEM;
	return 0;
}

static int
xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote,
	       unsigned short family)
{
	int err;
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);

	if (unlikely(afinfo == NULL))
		return -EINVAL;
	err = afinfo->get_saddr(net, local, remote);
	xfrm_policy_put_afinfo(afinfo);
	return err;
}

/* Resolve list of templates for the flow, given policy. */

static int
xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
		      struct xfrm_state **xfrm, unsigned short family)
{
	struct net *net = xp_net(policy);
	int nx;
	int i, error;
	xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
	xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
	xfrm_address_t tmp;

	for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
		struct xfrm_state *x;
		xfrm_address_t *remote = daddr;
		xfrm_address_t *local  = saddr;
		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

		if (tmpl->mode == XFRM_MODE_TUNNEL ||
		    tmpl->mode == XFRM_MODE_BEET) {
			remote = &tmpl->id.daddr;
			local = &tmpl->saddr;
			if (xfrm_addr_any(local, tmpl->encap_family)) {
				error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
				if (error)
					goto fail;
				local = &tmp;
			}
		}

		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);

		if (x && x->km.state == XFRM_STATE_VALID) {
			xfrm[nx++] = x;
			daddr = remote;
			saddr = local;
			continue;
		}
		if (x) {
			error = (x->km.state == XFRM_STATE_ERROR ?
				 -EINVAL : -EAGAIN);
			xfrm_state_put(x);
		} else if (error == -ESRCH) {
			error = -EAGAIN;
		}

		if (!tmpl->optional)
			goto fail;
	}
	return nx;

fail:
	for (nx--; nx >= 0; nx--)
		xfrm_state_put(xfrm[nx]);
	return error;
}

static int
xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
		  struct xfrm_state **xfrm, unsigned short family)
{
	struct xfrm_state *tp[XFRM_MAX_DEPTH];
	struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
	int cnx = 0;
	int error;
	int ret;
	int i;

	for (i = 0; i < npols; i++) {
		if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
			error = -ENOBUFS;
			goto fail;
		}

		ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
		if (ret < 0) {
			error = ret;
			goto fail;
		} else
			cnx += ret;
	}

	/* found states are sorted for outbound processing */
	if (npols > 1)
		xfrm_state_sort(xfrm, tpp, cnx, family);

	return cnx;

 fail:
	for (cnx--; cnx >= 0; cnx--)
		xfrm_state_put(tpp[cnx]);
	return error;

}

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */

static inline int xfrm_get_tos(const struct flowi *fl, int family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	int tos;

	if (!afinfo)
		return -EINVAL;

	tos = afinfo->get_tos(fl);

	xfrm_policy_put_afinfo(afinfo);

	return tos;
}

static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
{
	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
	struct dst_entry *dst = &xdst->u.dst;

	if (xdst->route == NULL) {
		/* Dummy bundle - if it has xfrms we were not
		 * able to build the bundle because template resolution
		 * failed. It means we need to retry resolving. */
		if (xdst->num_xfrms > 0)
			return NULL;
	} else if (dst->flags & DST_XFRM_QUEUE) {
		return NULL;
	} else {
		/* Real bundle */
		if (stale_bundle(dst))
			return NULL;
	}

	dst_hold(dst);
	return flo;
}

static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
{
	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
	struct dst_entry *dst = &xdst->u.dst;

	if (!xdst->route)
		return 0;
	if (stale_bundle(dst))
		return 0;

	return 1;
}

static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
{
	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
	struct dst_entry *dst = &xdst->u.dst;

	dst_free(dst);
}

static const struct flow_cache_ops xfrm_bundle_fc_ops = {
	.get = xfrm_bundle_flo_get,
	.check = xfrm_bundle_flo_check,
	.delete = xfrm_bundle_flo_delete,
};

static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	struct dst_ops *dst_ops;
	struct xfrm_dst *xdst;

	if (!afinfo)
		return ERR_PTR(-EINVAL);

	switch (family) {
	case AF_INET:
		dst_ops = &net->xfrm.xfrm4_dst_ops;
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		dst_ops = &net->xfrm.xfrm6_dst_ops;
		break;
#endif
	default:
		BUG();
	}
	xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0);

	if (likely(xdst)) {
		struct dst_entry *dst = &xdst->u.dst;

		memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst));
		xdst->flo.ops = &xfrm_bundle_fc_ops;
		if (afinfo->init_dst)
			afinfo->init_dst(net, xdst);
	} else
		xdst = ERR_PTR(-ENOBUFS);

	xfrm_policy_put_afinfo(afinfo);

	return xdst;
}

static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
				 int nfheader_len)
{
	struct xfrm_policy_afinfo *afinfo =
		xfrm_policy_get_afinfo(dst->ops->family);
	int err;

	if (!afinfo)
		return -EINVAL;

	err = afinfo->init_path(path, dst, nfheader_len);

	xfrm_policy_put_afinfo(afinfo);

	return err;
}

static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
				const struct flowi *fl)
{
	struct xfrm_policy_afinfo *afinfo =
		xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
	int err;

	if (!afinfo)
		return -EINVAL;

	err = afinfo->fill_dst(xdst, dev, fl);

	xfrm_policy_put_afinfo(afinfo);

	return err;
}

/* Allocate a chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... In short, bundle a bundle.
 */

static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
					    struct xfrm_state **xfrm, int nx,
					    const struct flowi *fl,
					    struct dst_entry *dst)
{
	struct net *net = xp_net(policy);
	unsigned long now = jiffies;
	struct net_device *dev;
	struct xfrm_mode *inner_mode;
	struct dst_entry *dst_prev = NULL;
	struct dst_entry *dst0 = NULL;
	int i = 0;
	int err;
	int header_len = 0;
	int nfheader_len = 0;
	int trailer_len = 0;
	int tos;
	int family = policy->selector.family;
	xfrm_address_t saddr, daddr;

	xfrm_flowi_addr_get(fl, &saddr, &daddr, family);

	tos = xfrm_get_tos(fl, family);
	err = tos;
	if (tos < 0)
		goto put_states;

	dst_hold(dst);

	for (; i < nx; i++) {
		struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
		struct dst_entry *dst1 = &xdst->u.dst;

		err = PTR_ERR(xdst);
		if (IS_ERR(xdst)) {
			dst_release(dst);
			goto put_states;
		}

		if (xfrm[i]->sel.family == AF_UNSPEC) {
			inner_mode = xfrm_ip2inner_mode(xfrm[i],
							xfrm_af2proto(family));
			if (!inner_mode) {
				err = -EAFNOSUPPORT;
				dst_release(dst);
				goto put_states;
			}
		} else
			inner_mode = xfrm[i]->inner_mode;

		if (!dst_prev)
			dst0 = dst1;
		else {
			dst_prev->child = dst_clone(dst1);
			dst1->flags |= DST_NOHASH;
		}

		xdst->route = dst;
		dst_copy_metrics(dst1, dst);

		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
			family = xfrm[i]->props.family;
			dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr,
					      family);
			err = PTR_ERR(dst);
			if (IS_ERR(dst))
				goto put_states;
		} else
			dst_hold(dst);

		dst1->xfrm = xfrm[i];
		xdst->xfrm_genid = xfrm[i]->genid;

		dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
		dst1->flags |= DST_HOST;
		dst1->lastuse = now;

		dst1->input = dst_discard;
		dst1->output = inner_mode->afinfo->output;

		dst1->next = dst_prev;
		dst_prev = dst1;

		header_len += xfrm[i]->props.header_len;
		if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
			nfheader_len += xfrm[i]->props.header_len;
		trailer_len += xfrm[i]->props.trailer_len;
	}

	dst_prev->child = dst;
	dst0->path = dst;

	err = -ENODEV;
	dev = dst->dev;
	if (!dev)
		goto free_dst;

	xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);
	xfrm_init_pmtu(dst_prev);

	for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) {
		struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev;

		err = xfrm_fill_dst(xdst, dev, fl);
		if (err)
			goto free_dst;

		dst_prev->header_len = header_len;
		dst_prev->trailer_len = trailer_len;
		header_len -= xdst->u.dst.xfrm->props.header_len;
		trailer_len -= xdst->u.dst.xfrm->props.trailer_len;
	}

out:
	return dst0;

put_states:
	for (; i < nx; i++)
		xfrm_state_put(xfrm[i]);
free_dst:
	if (dst0)
		dst_free(dst0);
	dst0 = ERR_PTR(err);
	goto out;
}

#ifdef CONFIG_XFRM_SUB_POLICY
static int xfrm_dst_alloc_copy(void **target, const void *src, int size)
{
	if (!*target) {
		*target = kmalloc(size, GFP_ATOMIC);
		if (!*target)
			return -ENOMEM;
	}

	memcpy(*target, src, size);
	return 0;
}
#endif

static int xfrm_dst_update_parent(struct dst_entry *dst,
				  const struct xfrm_selector *sel)
{
#ifdef CONFIG_XFRM_SUB_POLICY
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
	return xfrm_dst_alloc_copy((void **)&(xdst->partner),
				   sel, sizeof(*sel));
#else
	return 0;
#endif
}

static int xfrm_dst_update_origin(struct dst_entry *dst,
				  const struct flowi *fl)
{
#ifdef CONFIG_XFRM_SUB_POLICY
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
	return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl));
#else
	return 0;
#endif
}

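/* Validate pols[0] and, when it is a sub-policy, also look up the
 * corresponding main policy. *num_pols and *num_xfrms are updated
 * accordingly; *num_xfrms is set to -1 when any policy does not allow
 * the flow.
 */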
static int xfrm_expand_policies(const struct flowi *fl, u16 family,
				struct xfrm_policy **pols,
				int *num_pols, int *num_xfrms)
{
	int i;

	if (*num_pols == 0 || !pols[0]) {
		*num_pols = 0;
		*num_xfrms = 0;
		return 0;
	}
	if (IS_ERR(pols[0]))
		return PTR_ERR(pols[0]);

	*num_xfrms = pols[0]->xfrm_nr;

#ifdef CONFIG_XFRM_SUB_POLICY
	if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW &&
	    pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
						    XFRM_POLICY_TYPE_MAIN,
						    fl, family,
						    XFRM_POLICY_OUT);
		if (pols[1]) {
			if (IS_ERR(pols[1])) {
				xfrm_pols_put(pols, *num_pols);
				return PTR_ERR(pols[1]);
			}
			(*num_pols)++;
			(*num_xfrms) += pols[1]->xfrm_nr;
		}
	}
#endif
	for (i = 0; i < *num_pols; i++) {
		if (pols[i]->action != XFRM_POLICY_ALLOW) {
			*num_xfrms = -1;
			break;
		}
	}

	return 0;

}

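/* Resolve states for all templates of @pols and, if at least one state
 * was found, instantiate a bundle on top of @dst_orig.
 */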
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
			       const struct flowi *fl, u16 family,
			       struct dst_entry *dst_orig)
{
	struct net *net = xp_net(pols[0]);
	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
	struct dst_entry *dst;
	struct xfrm_dst *xdst;
	int err;

	/* Try to instantiate a bundle */
	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
	if (err <= 0) {
		if (err != 0 && err != -EAGAIN)
			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
		return ERR_PTR(err);
	}

	dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
	if (IS_ERR(dst)) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
		return ERR_CAST(dst);
	}

	xdst = (struct xfrm_dst *)dst;
	xdst->num_xfrms = err;
	if (num_pols > 1)
		err = xfrm_dst_update_parent(dst, &pols[1]->selector);
	else
		err = xfrm_dst_update_origin(dst, fl);
	if (unlikely(err)) {
		dst_free(dst);
		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
		return ERR_PTR(err);
	}

	xdst->num_pols = num_pols;
	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
	xdst->policy_genid = atomic_read(&pols[0]->genid);

	return xdst;
}

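/* Hold-queue timer callback: once the flow resolves to a real bundle,
 * re-route and transmit every queued packet; otherwise double the
 * timeout and re-arm, purging the queue once XFRM_QUEUE_TMO_MAX is
 * exceeded.
 */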
static void xfrm_policy_queue_process(unsigned long arg)
{
	struct sk_buff *skb;
	struct sock *sk;
	struct dst_entry *dst;
	struct xfrm_policy *pol = (struct xfrm_policy *)arg;
	struct xfrm_policy_queue *pq = &pol->polq;
	struct flowi fl;
	struct sk_buff_head list;

	spin_lock(&pq->hold_queue.lock);
	skb = skb_peek(&pq->hold_queue);
	if (!skb) {
		spin_unlock(&pq->hold_queue.lock);
		goto out;
	}
	dst = skb_dst(skb);
	sk = skb->sk;
	xfrm_decode_session(skb, &fl, dst->ops->family);
	spin_unlock(&pq->hold_queue.lock);

	dst_hold(dst->path);
	dst = xfrm_lookup(xp_net(pol), dst->path, &fl,
			  sk, 0);
	if (IS_ERR(dst))
		goto purge_queue;

	if (dst->flags & DST_XFRM_QUEUE) {
		dst_release(dst);

		if (pq->timeout >= XFRM_QUEUE_TMO_MAX)
			goto purge_queue;

		pq->timeout = pq->timeout << 1;
		if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
			xfrm_pol_hold(pol);
		goto out;
	}

	dst_release(dst);

	__skb_queue_head_init(&list);

	spin_lock(&pq->hold_queue.lock);
	pq->timeout = 0;
	skb_queue_splice_init(&pq->hold_queue, &list);
	spin_unlock(&pq->hold_queue.lock);

	while (!skb_queue_empty(&list)) {
		skb = __skb_dequeue(&list);

		xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
		dst_hold(skb_dst(skb)->path);
		dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path,
				  &fl, skb->sk, 0);
		if (IS_ERR(dst)) {
			kfree_skb(skb);
			continue;
		}

		nf_reset(skb);
		skb_dst_drop(skb);
		skb_dst_set(skb, dst);

		dst_output(skb);
	}

out:
	xfrm_pol_put(pol);
	return;

purge_queue:
	pq->timeout = 0;
	xfrm_queue_purge(&pq->hold_queue);
	xfrm_pol_put(pol);
}

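/* Output handler of dummy (DST_XFRM_QUEUE) bundles: park the skb on the
 * owning policy's hold queue until its states become available, arming
 * the hold timer if it is not already pending.
 */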
static int xdst_queue_output(struct sock *sk, struct sk_buff *skb)
{
	unsigned long sched_next;
	struct dst_entry *dst = skb_dst(skb);
	struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
	struct xfrm_policy *pol = xdst->pols[0];
	struct xfrm_policy_queue *pq = &pol->polq;

	if (unlikely(skb_fclone_busy(sk, skb))) {
		kfree_skb(skb);
		return 0;
	}

	if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
		kfree_skb(skb);
		return -EAGAIN;
	}

	skb_dst_force(skb);

	spin_lock_bh(&pq->hold_queue.lock);

	if (!pq->timeout)
		pq->timeout = XFRM_QUEUE_TMO_MIN;

	sched_next = jiffies + pq->timeout;

	if (del_timer(&pq->hold_timer)) {
		if (time_before(pq->hold_timer.expires, sched_next))
			sched_next = pq->hold_timer.expires;
		xfrm_pol_put(pol);
	}

	__skb_queue_tail(&pq->hold_queue, skb);
	if (!mod_timer(&pq->hold_timer, sched_next))
		xfrm_pol_hold(pol);

	spin_unlock_bh(&pq->hold_queue.lock);

	return 0;
}

static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
						 struct xfrm_flo *xflo,
						 const struct flowi *fl,
						 int num_xfrms,
						 u16 family)
{
	int err;
	struct net_device *dev;
	struct dst_entry *dst;
	struct dst_entry *dst1;
	struct xfrm_dst *xdst;

	xdst = xfrm_alloc_dst(net, family);
	if (IS_ERR(xdst))
		return xdst;

	if (!(xflo->flags & XFRM_LOOKUP_QUEUE) ||
	    net->xfrm.sysctl_larval_drop ||
	    num_xfrms <= 0)
		return xdst;

	dst = xflo->dst_orig;
	dst1 = &xdst->u.dst;
	dst_hold(dst);
	xdst->route = dst;

	dst_copy_metrics(dst1, dst);

	dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
	dst1->flags |= DST_HOST | DST_XFRM_QUEUE;
	dst1->lastuse = jiffies;

	dst1->input = dst_discard;
	dst1->output = xdst_queue_output;

	dst_hold(dst);
	dst1->child = dst;
	dst1->path = dst;

	xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);

	err = -ENODEV;
	dev = dst->dev;
	if (!dev)
		goto free_dst;

	err = xfrm_fill_dst(xdst, dev, fl);
	if (err)
		goto free_dst;

out:
	return xdst;

free_dst:
	dst_release(dst1);
	xdst = ERR_PTR(err);
	goto out;
}

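/* Flow-cache resolver for bundles: reuse the cached bundle while its
 * policies are still alive, otherwise look the policies up again and
 * build a new bundle, falling back to a dummy bundle when the templates
 * cannot be resolved yet.
 */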
static struct flow_cache_object *
xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
		   struct flow_cache_object *oldflo, void *ctx)
{
	struct xfrm_flo *xflo = (struct xfrm_flo *)ctx;
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
	struct xfrm_dst *xdst, *new_xdst;
	int num_pols = 0, num_xfrms = 0, i, err, pol_dead;

	/* Check if the policies from the old bundle are still usable */
	xdst = NULL;
	if (oldflo) {
		xdst = container_of(oldflo, struct xfrm_dst, flo);
		num_pols = xdst->num_pols;
		num_xfrms = xdst->num_xfrms;
		pol_dead = 0;
		for (i = 0; i < num_pols; i++) {
			pols[i] = xdst->pols[i];
			pol_dead |= pols[i]->walk.dead;
		}
		if (pol_dead) {
			dst_free(&xdst->u.dst);
			xdst = NULL;
			num_pols = 0;
			num_xfrms = 0;
			oldflo = NULL;
		}
	}

	/* Resolve the policies to use if we couldn't get them from the
	 * previous cache entry */
	if (xdst == NULL) {
		num_pols = 1;
		pols[0] = __xfrm_policy_lookup(net, fl, family,
					       flow_to_policy_dir(dir));
		err = xfrm_expand_policies(fl, family, pols,
					   &num_pols, &num_xfrms);
		if (err < 0)
			goto inc_error;
		if (num_pols == 0)
			return NULL;
		if (num_xfrms <= 0)
			goto make_dummy_bundle;
	}

	new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
						  xflo->dst_orig);
	if (IS_ERR(new_xdst)) {
		err = PTR_ERR(new_xdst);
		if (err != -EAGAIN)
			goto error;
		if (oldflo == NULL)
			goto make_dummy_bundle;
		dst_hold(&xdst->u.dst);
		return oldflo;
	} else if (new_xdst == NULL) {
		num_xfrms = 0;
		if (oldflo == NULL)
			goto make_dummy_bundle;
		xdst->num_xfrms = 0;
		dst_hold(&xdst->u.dst);
		return oldflo;
	}

	/* Kill the previous bundle */
	if (xdst) {
		/* The policies were stolen by the newly generated bundle */
		xdst->num_pols = 0;
		dst_free(&xdst->u.dst);
	}

	/* The flow cache does not hold a reference, it dst_free()'s,
	 * but we do need to return one reference for the original caller */
	dst_hold(&new_xdst->u.dst);
	return &new_xdst->flo;

make_dummy_bundle:
	/* We found policies, but there are no bundles to instantiate:
	 * either the policy blocks, has no transformations, or we could
	 * not build a template (no xfrm_states). */
	xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family);
	if (IS_ERR(xdst)) {
		xfrm_pols_put(pols, num_pols);
		return ERR_CAST(xdst);
	}
	xdst->num_pols = num_pols;
	xdst->num_xfrms = num_xfrms;
	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);

	dst_hold(&xdst->u.dst);
	return &xdst->flo;

inc_error:
	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
error:
	if (xdst != NULL)
		dst_free(&xdst->u.dst);
	else
		xfrm_pols_put(pols, num_pols);
	return ERR_PTR(err);
}

static struct dst_entry *make_blackhole(struct net *net, u16 family,
					struct dst_entry *dst_orig)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	struct dst_entry *ret;

	if (!afinfo) {
		dst_release(dst_orig);
		return ERR_PTR(-EINVAL);
	} else {
		ret = afinfo->blackhole_route(net, dst_orig);
	}
	xfrm_policy_put_afinfo(afinfo);

	return ret;
}

/* Main function: finds/creates a bundle for a given flow.
 *
 * At the moment we eat a raw IP route, mostly to speed up lookups
 * on interfaces with IPsec disabled.
 */
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
			      const struct flowi *fl,
			      struct sock *sk, int flags)
{
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
	struct flow_cache_object *flo;
	struct xfrm_dst *xdst;
	struct dst_entry *dst, *route;
	u16 family = dst_orig->ops->family;
	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
	int i, err, num_pols, num_xfrms = 0, drop_pols = 0;

	dst = NULL;
	xdst = NULL;
	route = NULL;

	if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
		num_pols = 1;
		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
		err = xfrm_expand_policies(fl, family, pols,
					   &num_pols, &num_xfrms);
		if (err < 0)
			goto dropdst;

		if (num_pols) {
			if (num_xfrms <= 0) {
				drop_pols = num_pols;
				goto no_transform;
			}

			xdst = xfrm_resolve_and_create_bundle(
					pols, num_pols, fl,
					family, dst_orig);
			if (IS_ERR(xdst)) {
				xfrm_pols_put(pols, num_pols);
				err = PTR_ERR(xdst);
				goto dropdst;
			} else if (xdst == NULL) {
				num_xfrms = 0;
				drop_pols = num_pols;
				goto no_transform;
			}

			dst_hold(&xdst->u.dst);
			xdst->u.dst.flags |= DST_NOCACHE;
			route = xdst->route;
		}
	}

	if (xdst == NULL) {
		struct xfrm_flo xflo;

		xflo.dst_orig = dst_orig;
		xflo.flags = flags;

		/* To accelerate a bit...  */
		if ((dst_orig->flags & DST_NOXFRM) ||
		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
			goto nopol;

		flo = flow_cache_lookup(net, fl, family, dir,
					xfrm_bundle_lookup, &xflo);
		if (flo == NULL)
			goto nopol;
		if (IS_ERR(flo)) {
			err = PTR_ERR(flo);
			goto dropdst;
		}
		xdst = container_of(flo, struct xfrm_dst, flo);

		num_pols = xdst->num_pols;
		num_xfrms = xdst->num_xfrms;
		memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
		route = xdst->route;
	}

	dst = &xdst->u.dst;
	if (route == NULL && num_xfrms > 0) {
		/* The only case when xfrm_bundle_lookup() returns a
		 * bundle with a null route is when the template could
		 * not be resolved.  It means the policies are there, but
		 * the bundle could not be created, since we don't yet
		 * have the xfrm_states.  We need to wait for the KM to
		 * negotiate new SAs or bail out with an error. */
		if (net->xfrm.sysctl_larval_drop) {
			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
			err = -EREMOTE;
			goto error;
		}

		err = -EAGAIN;

		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
		goto error;
	}

no_transform:
	if (num_pols == 0)
		goto nopol;

	if ((flags & XFRM_LOOKUP_ICMP) &&
	    !(pols[0]->flags & XFRM_POLICY_ICMP)) {
		err = -ENOENT;
		goto error;
	}

	for (i = 0; i < num_pols; i++)
		pols[i]->curlft.use_time = get_seconds();

	if (num_xfrms < 0) {
		/* Prohibit the flow */
		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
		err = -EPERM;
		goto error;
	} else if (num_xfrms > 0) {
		/* Flow transformed */
		dst_release(dst_orig);
	} else {
		/* Flow passes untransformed */
		dst_release(dst);
		dst = dst_orig;
	}
ok:
	xfrm_pols_put(pols, drop_pols);
	if (dst && dst->xfrm &&
	    dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
		dst->flags |= DST_XFRM_TUNNEL;
	return dst;

nopol:
	if (!(flags & XFRM_LOOKUP_ICMP)) {
		dst = dst_orig;
		goto ok;
	}
	err = -ENOENT;
error:
	dst_release(dst);
dropdst:
	if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
		dst_release(dst_orig);
	xfrm_pols_put(pols, drop_pols);
	return ERR_PTR(err);
}
EXPORT_SYMBOL(xfrm_lookup);
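
/* A minimal sketch of a typical IPv4 caller (hypothetical helper, error
 * handling trimmed): route the flow first, then let xfrm_lookup() swap
 * the plain route for an IPsec bundle when a policy matches:
 *
 *	static struct dst_entry *sketch_output_route(struct net *net,
 *						     struct flowi4 *fl4,
 *						     struct sock *sk)
 *	{
 *		struct rtable *rt = ip_route_output_key(net, fl4);
 *
 *		if (IS_ERR(rt))
 *			return ERR_CAST(rt);
 *		return xfrm_lookup(net, &rt->dst, flowi4_to_flowi(fl4),
 *				   sk, 0);
 *	}
 *
 * On success the returned dst is either the original route (flow passes
 * untransformed) or a bundle whose output path applies the SAs.
 */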

/* Callers of xfrm_lookup_route() must ensure a call to dst_output().
 * Otherwise we may send out blackholed packets.
 */
struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
				    const struct flowi *fl,
				    struct sock *sk, int flags)
{
	struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk,
					    flags | XFRM_LOOKUP_QUEUE |
					    XFRM_LOOKUP_KEEP_DST_REF);

	if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE)
		return make_blackhole(net, dst_orig->ops->family, dst_orig);

	return dst;
}
EXPORT_SYMBOL(xfrm_lookup_route);

static inline int
xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
{
	struct xfrm_state *x;

	if (!skb->sp || idx < 0 || idx >= skb->sp->len)
		return 0;
	x = skb->sp->xvec[idx];
	if (!x->type->reject)
		return 0;
	return x->type->reject(x, skb, fl);
}

/* When the skb is transformed back to its "native" form, we have to
 * check policy restrictions.  At the moment we do this in a maximally
 * stupid way.  Shame on me. :-)  Of course, connected sockets must
 * have their policy cached at them.
 */

static inline int
xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
	      unsigned short family)
{
	if (xfrm_state_kern(x))
		return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
	return	x->id.proto == tmpl->id.proto &&
		(x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
		(x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
		x->props.mode == tmpl->mode &&
		(tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) ||
		 !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
		!(x->props.mode != XFRM_MODE_TRANSPORT &&
		  xfrm_state_addr_cmp(tmpl, x, family));
}

/*
 * Zero or a positive value is returned when validation succeeds: either
 * the bypass index (for an optional transport-mode template) or one past
 * the index of the secpath state matched against the template.
 * -1 is returned when no matching template is found.
 * Otherwise "-2 - errored_index" is returned.
 */
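/* Example: with sp->xvec = [AH transport, ESP tunnel] and a mandatory
 * ESP tunnel template, a call with start == 0 skips the transport-mode
 * AH at index 0, matches ESP at index 1 and returns 2.  If instead a
 * non-matching tunnel-mode state sat at index 1, the walk would stop
 * there and return -2 - 1 = -3, flagging index 1 as the culprit.
 */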
static inline int
xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
	       unsigned short family)
{
	int idx = start;

	if (tmpl->optional) {
		if (tmpl->mode == XFRM_MODE_TRANSPORT)
			return start;
	} else
		start = -1;
	for (; idx < sp->len; idx++) {
		if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
			return ++idx;
		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
			if (start == -1)
				start = -2-idx;
			break;
		}
	}
	return start;
}

int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
			  unsigned int family, int reverse)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	int err;

	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;

	afinfo->decode_session(skb, fl, reverse);
	err = security_xfrm_decode_session(skb, &fl->flowi_secid);
	xfrm_policy_put_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(__xfrm_decode_session);

static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
{
	for (; k < sp->len; k++) {
		if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
			*idxp = k;
			return 1;
		}
	}

	return 0;
}

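/* Inbound policy check: decode the flow from the skb, verify the SAs
 * already applied against their selectors, then collect the relevant
 * socket/main(/sub) policies and match their accumulated templates
 * against the secpath.  Returns 1 when the packet may pass and 0 when
 * it must be dropped.
 */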
int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
			unsigned short family)
{
	struct net *net = dev_net(skb->dev);
	struct xfrm_policy *pol;
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
	int npols = 0;
	int xfrm_nr;
	int pi;
	int reverse;
	struct flowi fl;
	u8 fl_dir;
	int xerr_idx = -1;

	reverse = dir & ~XFRM_POLICY_MASK;
	dir &= XFRM_POLICY_MASK;
	fl_dir = policy_to_flow_dir(dir);

	if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
		return 0;
	}

	nf_nat_decode_session(skb, &fl, family);

	/* First, check the used SAs against their selectors. */
	if (skb->sp) {
		int i;

		for (i = skb->sp->len-1; i >= 0; i--) {
			struct xfrm_state *x = skb->sp->xvec[i];
			if (!xfrm_selector_match(&x->sel, &fl, family)) {
				XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
				return 0;
			}
		}
	}

	pol = NULL;
	if (sk && sk->sk_policy[dir]) {
		pol = xfrm_sk_policy_lookup(sk, dir, &fl);
		if (IS_ERR(pol)) {
			XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
			return 0;
		}
	}

	if (!pol) {
		struct flow_cache_object *flo;

		flo = flow_cache_lookup(net, &fl, family, fl_dir,
					xfrm_policy_lookup, NULL);
		if (IS_ERR_OR_NULL(flo))
			pol = ERR_CAST(flo);
		else
			pol = container_of(flo, struct xfrm_policy, flo);
	}

	if (IS_ERR(pol)) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
		return 0;
	}

	if (!pol) {
		if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) {
			xfrm_secpath_reject(xerr_idx, skb, &fl);
			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
			return 0;
		}
		return 1;
	}

	pol->curlft.use_time = get_seconds();

	pols[0] = pol;
	npols++;
#ifdef CONFIG_XFRM_SUB_POLICY
	if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
		pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
						    &fl, family,
						    XFRM_POLICY_IN);
		if (pols[1]) {
			if (IS_ERR(pols[1])) {
				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
				return 0;
			}
			pols[1]->curlft.use_time = get_seconds();
			npols++;
		}
	}
#endif

	if (pol->action == XFRM_POLICY_ALLOW) {
		struct sec_path *sp;
		static struct sec_path dummy;
		struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
		struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
		struct xfrm_tmpl **tpp = tp;
		int ti = 0;
		int i, k;

		if ((sp = skb->sp) == NULL)
			sp = &dummy;

		for (pi = 0; pi < npols; pi++) {
			if (pols[pi] != pol &&
			    pols[pi]->action != XFRM_POLICY_ALLOW) {
				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
				goto reject;
			}
			if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
				XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
				goto reject_error;
			}
			for (i = 0; i < pols[pi]->xfrm_nr; i++)
				tpp[ti++] = &pols[pi]->xfrm_vec[i];
		}
		xfrm_nr = ti;
		if (npols > 1) {
			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net);
			tpp = stp;
		}

		/* For each tunnel xfrm, find the first matching tmpl.
		 * For each tmpl before that, find the corresponding xfrm.
		 * Order is _important_.  Later we will implement some
		 * barriers, but at the moment barriers are implied
		 * between every two transformations.
		 */
		for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
			k = xfrm_policy_ok(tpp[i], sp, k, family);
			if (k < 0) {
				if (k < -1)
					/* "-2 - errored_index" returned */
					xerr_idx = -(2+k);
				XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
				goto reject;
			}
		}

		if (secpath_has_nontransport(sp, k, &xerr_idx)) {
			XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
			goto reject;
		}

		xfrm_pols_put(pols, npols);
		return 1;
	}
	XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);

reject:
	xfrm_secpath_reject(xerr_idx, skb, &fl);
reject_error:
	xfrm_pols_put(pols, npols);
	return 0;
}
EXPORT_SYMBOL(__xfrm_policy_check);

int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
	struct net *net = dev_net(skb->dev);
	struct flowi fl;
	struct dst_entry *dst;
	int res = 1;

	if (xfrm_decode_session(skb, &fl, family) < 0) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
		return 0;
	}

	skb_dst_force(skb);

	dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
	if (IS_ERR(dst)) {
		res = 0;
		dst = NULL;
	}
	skb_dst_set(skb, dst);
	return res;
}
EXPORT_SYMBOL(__xfrm_route_forward);

/* Optimize later using cookies and generation ids. */

static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
	/* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
	 * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to
	 * get validated by dst_ops->check on every use.  We do this
	 * because when a normal route referenced by an XFRM dst is
	 * obsoleted we do not go looking around for all parent
	 * referencing XFRM dsts so that we can invalidate them.  It
	 * is just too much work.  Instead we make the checks here on
	 * every use.  For example:
	 *
	 *	XFRM dst A --> IPv4 dst X
	 *
	 * X is the "xdst->route" of A (X is also the "dst->path" of A
	 * in this example).  If X is marked obsolete, "A" will not
	 * notice.  That's what we are validating here via the
	 * stale_bundle() check.
	 *
	 * When a policy's bundle is pruned, we dst_free() the XFRM
	 * dst which causes its ->obsolete field to be set to
	 * DST_OBSOLETE_DEAD.  If an XFRM dst has been pruned like
	 * this, we want to force a new route lookup.
	 */
	if (dst->obsolete < 0 && !stale_bundle(dst))
		return dst;

	return NULL;
}

static int stale_bundle(struct dst_entry *dst)
{
	return !xfrm_bundle_ok((struct xfrm_dst *)dst);
}

void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
	while ((dst = dst->child) && dst->xfrm && dst->dev == dev) {
		dst->dev = dev_net(dev)->loopback_dev;
		dev_hold(dst->dev);
		dev_put(dev);
	}
}
EXPORT_SYMBOL(xfrm_dst_ifdown);

static void xfrm_link_failure(struct sk_buff *skb)
{
	/* Impossible. Such a dst must be popped before it reaches the
	 * point of failure. */
}

static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
{
	if (dst) {
		if (dst->obsolete) {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

void xfrm_garbage_collect(struct net *net)
{
	flow_cache_flush(net);
}
EXPORT_SYMBOL(xfrm_garbage_collect);

static void xfrm_garbage_collect_deferred(struct net *net)
{
	flow_cache_flush_deferred(net);
}

static void xfrm_init_pmtu(struct dst_entry *dst)
{
	do {
		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
		u32 pmtu, route_mtu_cached;

		pmtu = dst_mtu(dst->child);
		xdst->child_mtu_cached = pmtu;

		pmtu = xfrm_state_mtu(dst->xfrm, pmtu);

		route_mtu_cached = dst_mtu(xdst->route);
		xdst->route_mtu_cached = route_mtu_cached;

		if (pmtu > route_mtu_cached)
			pmtu = route_mtu_cached;

		dst_metric_set(dst, RTAX_MTU, pmtu);
	} while ((dst = dst->next));
}

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */
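/* Besides validity, the walk below also refreshes the cached child and
 * route MTUs; when one changed, the second loop re-derives RTAX_MTU for
 * every bundle member from the deepest change back towards "first".
 */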

static int xfrm_bundle_ok(struct xfrm_dst *first)
{
	struct dst_entry *dst = &first->u.dst;
	struct xfrm_dst *last;
	u32 mtu;

	if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
	    (dst->dev && !netif_running(dst->dev)))
		return 0;

	if (dst->flags & DST_XFRM_QUEUE)
		return 1;

	last = NULL;

	do {
		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

		if (dst->xfrm->km.state != XFRM_STATE_VALID)
			return 0;
		if (xdst->xfrm_genid != dst->xfrm->genid)
			return 0;
		if (xdst->num_pols > 0 &&
		    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
			return 0;

		mtu = dst_mtu(dst->child);
		if (xdst->child_mtu_cached != mtu) {
			last = xdst;
			xdst->child_mtu_cached = mtu;
		}

		if (!dst_check(xdst->route, xdst->route_cookie))
			return 0;
		mtu = dst_mtu(xdst->route);
		if (xdst->route_mtu_cached != mtu) {
			last = xdst;
			xdst->route_mtu_cached = mtu;
		}

		dst = dst->child;
	} while (dst->xfrm);

	if (likely(!last))
		return 1;

	mtu = last->child_mtu_cached;
	for (;;) {
		dst = &last->u.dst;

		mtu = xfrm_state_mtu(dst->xfrm, mtu);
		if (mtu > last->route_mtu_cached)
			mtu = last->route_mtu_cached;
		dst_metric_set(dst, RTAX_MTU, mtu);

		if (last == first)
			break;

		last = (struct xfrm_dst *)last->u.dst.next;
		last->child_mtu_cached = mtu;
	}

	return 1;
}

static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
{
	return dst_metric_advmss(dst->path);
}

static unsigned int xfrm_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst_mtu(dst->path);
}

static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	return dst->path->ops->neigh_lookup(dst, skb, daddr);
}

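/* Address families plug their hooks in via
 * xfrm_policy_register_afinfo().  A hypothetical sketch of a minimal
 * registration (names here are illustrative, not a real family):
 *
 *	static struct xfrm_policy_afinfo sketch_afinfo = {
 *		.family		 = AF_INET,
 *		.dst_ops	 = &sketch_dst_ops,
 *		.decode_session	 = sketch_decode_session,
 *		.blackhole_route = sketch_blackhole_route,
 *	};
 *
 *	err = xfrm_policy_register_afinfo(&sketch_afinfo);
 *
 * Hooks left NULL in dst_ops (check, mtu, neigh_lookup, ...) are filled
 * in with the xfrm defaults below at registration time.
 */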
int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	int err = 0;
	if (unlikely(afinfo == NULL))
		return -EINVAL;
	if (unlikely(afinfo->family >= NPROTO))
		return -EAFNOSUPPORT;
	spin_lock(&xfrm_policy_afinfo_lock);
	if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
		err = -ENOBUFS;
	else {
		struct dst_ops *dst_ops = afinfo->dst_ops;
		if (likely(dst_ops->kmem_cachep == NULL))
			dst_ops->kmem_cachep = xfrm_dst_cache;
		if (likely(dst_ops->check == NULL))
			dst_ops->check = xfrm_dst_check;
		if (likely(dst_ops->default_advmss == NULL))
			dst_ops->default_advmss = xfrm_default_advmss;
		if (likely(dst_ops->mtu == NULL))
			dst_ops->mtu = xfrm_mtu;
		if (likely(dst_ops->negative_advice == NULL))
			dst_ops->negative_advice = xfrm_negative_advice;
		if (likely(dst_ops->link_failure == NULL))
			dst_ops->link_failure = xfrm_link_failure;
		if (likely(dst_ops->neigh_lookup == NULL))
			dst_ops->neigh_lookup = xfrm_neigh_lookup;
		if (likely(afinfo->garbage_collect == NULL))
			afinfo->garbage_collect = xfrm_garbage_collect_deferred;
		rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo);
	}
	spin_unlock(&xfrm_policy_afinfo_lock);

	return err;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);

int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	int err = 0;
	if (unlikely(afinfo == NULL))
		return -EINVAL;
	if (unlikely(afinfo->family >= NPROTO))
		return -EAFNOSUPPORT;
	spin_lock(&xfrm_policy_afinfo_lock);
	if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
		if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
			err = -EINVAL;
		else
			RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
					 NULL);
	}
	spin_unlock(&xfrm_policy_afinfo_lock);
	if (!err) {
		struct dst_ops *dst_ops = afinfo->dst_ops;

		synchronize_rcu();

		dst_ops->kmem_cachep = NULL;
		dst_ops->check = NULL;
		dst_ops->negative_advice = NULL;
		dst_ops->link_failure = NULL;
		afinfo->garbage_collect = NULL;
	}
	return err;
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);

static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_DOWN:
		xfrm_garbage_collect(dev_net(dev));
	}
	return NOTIFY_DONE;
}

static struct notifier_block xfrm_dev_notifier = {
	.notifier_call	= xfrm_dev_event,
};

#ifdef CONFIG_XFRM_STATISTICS
static int __net_init xfrm_statistics_init(struct net *net)
{
	int rv;
	net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib);
	if (!net->mib.xfrm_statistics)
		return -ENOMEM;
	rv = xfrm_proc_init(net);
	if (rv < 0)
		free_percpu(net->mib.xfrm_statistics);
	return rv;
}

static void xfrm_statistics_fini(struct net *net)
{
	xfrm_proc_fini(net);
	free_percpu(net->mib.xfrm_statistics);
}
#else
static int __net_init xfrm_statistics_init(struct net *net)
{
	return 0;
}

static void xfrm_statistics_fini(struct net *net)
{
}
#endif

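/* Per-netns policy bookkeeping starts small: the by-index and by-dst
 * hash tables get 8 buckets each here and are grown later by the
 * xfrm_hash_resize work item once they fill up.
 */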
static int __net_init xfrm_policy_init(struct net *net)
{
	unsigned int hmask, sz;
	int dir;

	if (net_eq(net, &init_net))
		xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
					   sizeof(struct xfrm_dst),
					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					   NULL);

	hmask = 8 - 1;
	sz = (hmask+1) * sizeof(struct hlist_head);

	net->xfrm.policy_byidx = xfrm_hash_alloc(sz);
	if (!net->xfrm.policy_byidx)
		goto out_byidx;
	net->xfrm.policy_idx_hmask = hmask;

	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		struct xfrm_policy_hash *htab;

		net->xfrm.policy_count[dir] = 0;
		net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0;
		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);

		htab = &net->xfrm.policy_bydst[dir];
		htab->table = xfrm_hash_alloc(sz);
		if (!htab->table)
			goto out_bydst;
		htab->hmask = hmask;
		htab->dbits4 = 32;
		htab->sbits4 = 32;
		htab->dbits6 = 128;
		htab->sbits6 = 128;
	}
	net->xfrm.policy_hthresh.lbits4 = 32;
	net->xfrm.policy_hthresh.rbits4 = 32;
	net->xfrm.policy_hthresh.lbits6 = 128;
	net->xfrm.policy_hthresh.rbits6 = 128;

	seqlock_init(&net->xfrm.policy_hthresh.lock);

	INIT_LIST_HEAD(&net->xfrm.policy_all);
	INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
	INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
	if (net_eq(net, &init_net))
		register_netdevice_notifier(&xfrm_dev_notifier);
	return 0;

out_bydst:
	for (dir--; dir >= 0; dir--) {
		struct xfrm_policy_hash *htab;

		htab = &net->xfrm.policy_bydst[dir];
		xfrm_hash_free(htab->table, sz);
	}
	xfrm_hash_free(net->xfrm.policy_byidx, sz);
out_byidx:
	return -ENOMEM;
}

static void xfrm_policy_fini(struct net *net)
{
	unsigned int sz;
	int dir;

	flush_work(&net->xfrm.policy_hash_work);
#ifdef CONFIG_XFRM_SUB_POLICY
	xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false);
#endif
	xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false);

	WARN_ON(!list_empty(&net->xfrm.policy_all));

	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		struct xfrm_policy_hash *htab;

		WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir]));

		htab = &net->xfrm.policy_bydst[dir];
		sz = (htab->hmask + 1) * sizeof(struct hlist_head);
		WARN_ON(!hlist_empty(htab->table));
		xfrm_hash_free(htab->table, sz);
	}

	sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
	WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
	xfrm_hash_free(net->xfrm.policy_byidx, sz);
}

static int __net_init xfrm_net_init(struct net *net)
{
	int rv;

	rv = xfrm_statistics_init(net);
	if (rv < 0)
		goto out_statistics;
	rv = xfrm_state_init(net);
	if (rv < 0)
		goto out_state;
	rv = xfrm_policy_init(net);
	if (rv < 0)
		goto out_policy;
	rv = xfrm_sysctl_init(net);
	if (rv < 0)
		goto out_sysctl;
	rv = flow_cache_init(net);
	if (rv < 0)
		goto out;

	/* Initialize the per-net locks here */
	spin_lock_init(&net->xfrm.xfrm_state_lock);
	rwlock_init(&net->xfrm.xfrm_policy_lock);
	mutex_init(&net->xfrm.xfrm_cfg_mutex);

	return 0;

out:
	xfrm_sysctl_fini(net);
out_sysctl:
	xfrm_policy_fini(net);
out_policy:
	xfrm_state_fini(net);
out_state:
	xfrm_statistics_fini(net);
out_statistics:
	return rv;
}

static void __net_exit xfrm_net_exit(struct net *net)
{
	flow_cache_fini(net);
	xfrm_sysctl_fini(net);
	xfrm_policy_fini(net);
	xfrm_state_fini(net);
	xfrm_statistics_fini(net);
}

static struct pernet_operations __net_initdata xfrm_net_ops = {
	.init = xfrm_net_init,
	.exit = xfrm_net_exit,
};

void __init xfrm_init(void)
{
	register_pernet_subsys(&xfrm_net_ops);
	xfrm_input_init();
}

#ifdef CONFIG_AUDITSYSCALL
static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
					 struct audit_buffer *audit_buf)
{
	struct xfrm_sec_ctx *ctx = xp->security;
	struct xfrm_selector *sel = &xp->selector;

	if (ctx)
		audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
				 ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);

	switch (sel->family) {
	case AF_INET:
		audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4);
		if (sel->prefixlen_s != 32)
			audit_log_format(audit_buf, " src_prefixlen=%d",
					 sel->prefixlen_s);
		audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4);
		if (sel->prefixlen_d != 32)
			audit_log_format(audit_buf, " dst_prefixlen=%d",
					 sel->prefixlen_d);
		break;
	case AF_INET6:
		audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6);
		if (sel->prefixlen_s != 128)
			audit_log_format(audit_buf, " src_prefixlen=%d",
					 sel->prefixlen_s);
		audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6);
		if (sel->prefixlen_d != 128)
			audit_log_format(audit_buf, " dst_prefixlen=%d",
					 sel->prefixlen_d);
		break;
	}
}

void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid)
{
	struct audit_buffer *audit_buf;

	audit_buf = xfrm_audit_start("SPD-add");
	if (audit_buf == NULL)
		return;
	xfrm_audit_helper_usrinfo(task_valid, audit_buf);
	audit_log_format(audit_buf, " res=%u", result);
	xfrm_audit_common_policyinfo(xp, audit_buf);
	audit_log_end(audit_buf);
}
EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);

void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
			      bool task_valid)
{
	struct audit_buffer *audit_buf;

	audit_buf = xfrm_audit_start("SPD-delete");
	if (audit_buf == NULL)
		return;
	xfrm_audit_helper_usrinfo(task_valid, audit_buf);
	audit_log_format(audit_buf, " res=%u", result);
	xfrm_audit_common_policyinfo(xp, audit_buf);
	audit_log_end(audit_buf);
}
EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
#endif

#ifdef CONFIG_XFRM_MIGRATE
static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
					const struct xfrm_selector *sel_tgt)
{
	if (sel_cmp->proto == IPSEC_ULPROTO_ANY) {
		if (sel_tgt->family == sel_cmp->family &&
		    xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr,
				    sel_cmp->family) &&
		    xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr,
				    sel_cmp->family) &&
		    sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
		    sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) {
			return true;
		}
	} else {
		if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) {
			return true;
		}
	}
	return false;
}

static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
						    u8 dir, u8 type, struct net *net)
{
	struct xfrm_policy *pol, *ret = NULL;
	struct hlist_head *chain;
	u32 priority = ~0U;

	read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/
	chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
	hlist_for_each_entry(pol, chain, bydst) {
		if (xfrm_migrate_selector_match(sel, &pol->selector) &&
		    pol->type == type) {
			ret = pol;
			priority = ret->priority;
			break;
		}
	}
	chain = &net->xfrm.policy_inexact[dir];
	hlist_for_each_entry(pol, chain, bydst) {
		if (xfrm_migrate_selector_match(sel, &pol->selector) &&
		    pol->type == type &&
		    pol->priority < priority) {
			ret = pol;
			break;
		}
	}

	if (ret)
		xfrm_pol_hold(ret);

	read_unlock_bh(&net->xfrm.xfrm_policy_lock);

	return ret;
}

static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
{
	int match = 0;

	if (t->mode == m->mode && t->id.proto == m->proto &&
	    (m->reqid == 0 || t->reqid == m->reqid)) {
		switch (t->mode) {
		case XFRM_MODE_TUNNEL:
		case XFRM_MODE_BEET:
			if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
					    m->old_family) &&
			    xfrm_addr_equal(&t->saddr, &m->old_saddr,
					    m->old_family)) {
				match = 1;
			}
			break;
		case XFRM_MODE_TRANSPORT:
			/* In transport mode the template does not store
			 * any IP addresses, hence we compare only mode
			 * and protocol. */
			match = 1;
			break;
		default:
			break;
		}
	}
	return match;
}

/* update endpoint address(es) of template(s) */
static int xfrm_policy_migrate(struct xfrm_policy *pol,
			       struct xfrm_migrate *m, int num_migrate)
{
	struct xfrm_migrate *mp;
	int i, j, n = 0;

	write_lock_bh(&pol->lock);
	if (unlikely(pol->walk.dead)) {
		/* target policy has been deleted */
		write_unlock_bh(&pol->lock);
		return -ENOENT;
	}

	for (i = 0; i < pol->xfrm_nr; i++) {
		for (j = 0, mp = m; j < num_migrate; j++, mp++) {
			if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i]))
				continue;
			n++;
			if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
			    pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
				continue;
			/* update endpoints */
			memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
			       sizeof(pol->xfrm_vec[i].id.daddr));
			memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr,
			       sizeof(pol->xfrm_vec[i].saddr));
			pol->xfrm_vec[i].encap_family = mp->new_family;
			/* flush bundles */
			atomic_inc(&pol->genid);
		}
	}

	write_unlock_bh(&pol->lock);

	if (!n)
		return -ENODATA;

	return 0;
}

static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
{
	int i, j;

	if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH)
		return -EINVAL;

	for (i = 0; i < num_migrate; i++) {
		if (xfrm_addr_equal(&m[i].old_daddr, &m[i].new_daddr,
				    m[i].old_family) &&
		    xfrm_addr_equal(&m[i].old_saddr, &m[i].new_saddr,
				    m[i].old_family))
			return -EINVAL;
		if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) ||
		    xfrm_addr_any(&m[i].new_saddr, m[i].new_family))
			return -EINVAL;

		/* check if there is any duplicated entry */
		for (j = i + 1; j < num_migrate; j++) {
			if (!memcmp(&m[i].old_daddr, &m[j].old_daddr,
				    sizeof(m[i].old_daddr)) &&
			    !memcmp(&m[i].old_saddr, &m[j].old_saddr,
				    sizeof(m[i].old_saddr)) &&
			    m[i].proto == m[j].proto &&
			    m[i].mode == m[j].mode &&
			    m[i].reqid == m[j].reqid &&
			    m[i].old_family == m[j].old_family)
				return -EINVAL;
		}
	}

	return 0;
}

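/* Migrate an IPsec flow to new endpoints (as used for Mobile IPv6 and
 * MOBIKE): validate the request, find the policy by selector, clone the
 * matching states with the new addresses, rewrite the policy templates,
 * delete the old states and finally announce the move to the key
 * managers.
 */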
int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
		 struct xfrm_migrate *m, int num_migrate,
		 struct xfrm_kmaddress *k, struct net *net)
{
	int i, err, nx_cur = 0, nx_new = 0;
	struct xfrm_policy *pol = NULL;
	struct xfrm_state *x, *xc;
	struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
	struct xfrm_state *x_new[XFRM_MAX_DEPTH];
	struct xfrm_migrate *mp;

	if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
		goto out;

	/* Stage 1 - find policy */
	if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) {
		err = -ENOENT;
		goto out;
	}

	/* Stage 2 - find and update state(s) */
	for (i = 0, mp = m; i < num_migrate; i++, mp++) {
		if ((x = xfrm_migrate_state_find(mp, net))) {
			x_cur[nx_cur] = x;
			nx_cur++;
			if ((xc = xfrm_state_migrate(x, mp))) {
				x_new[nx_new] = xc;
				nx_new++;
			} else {
				err = -ENODATA;
				goto restore_state;
			}
		}
	}

	/* Stage 3 - update policy */
	if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0)
		goto restore_state;

	/* Stage 4 - delete old state(s) */
	if (nx_cur) {
		xfrm_states_put(x_cur, nx_cur);
		xfrm_states_delete(x_cur, nx_cur);
	}

	/* Stage 5 - announce */
	km_migrate(sel, dir, type, m, num_migrate, k);

	xfrm_pol_put(pol);

	return 0;
out:
	return err;

restore_state:
	if (pol)
		xfrm_pol_put(pol);
	if (nx_cur)
		xfrm_states_put(x_cur, nx_cur);
	if (nx_new)
		xfrm_states_delete(x_new, nx_new);

	return err;
}
EXPORT_SYMBOL(xfrm_migrate);
#endif
