/* Expectation handling for nf_conntrack. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (c) 2005-2012 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>

unsigned int nf_ct_expect_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);

unsigned int nf_ct_expect_max __read_mostly;

static struct kmem_cache *nf_ct_expect_cachep __read_mostly;

/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
				u32 portid, int report)
{
	struct nf_conn_help *master_help = nfct_help(exp->master);
	struct net *net = nf_ct_exp_net(exp);

	NF_CT_ASSERT(master_help);
	NF_CT_ASSERT(!timer_pending(&exp->timeout));

	hlist_del_rcu(&exp->hnode);
	net->ct.expect_count--;

	hlist_del(&exp->lnode);
	master_help->expecting[exp->class]--;

	nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
	nf_ct_expect_put(exp);

	NF_CT_STAT_INC(net, expect_delete);
}
EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);

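/* Timer callback: the expectation timed out without being matched, so
 * unlink it under the expect lock and drop the reference held by the
 * timer.
 */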
static void nf_ct_expectation_timed_out(unsigned long ul_expect)
{
	struct nf_conntrack_expect *exp = (void *)ul_expect;

	spin_lock_bh(&nf_conntrack_expect_lock);
	nf_ct_unlink_expect(exp);
	spin_unlock_bh(&nf_conntrack_expect_lock);
	nf_ct_expect_put(exp);
}

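/* Hash an expectation by the destination part of its tuple (plus the
 * layer 3 protocol number), i.e. the part an incoming packet is
 * matched against.
 */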
static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;

	if (unlikely(!nf_conntrack_hash_rnd))
		init_nf_conntrack_hash_rnd();

	hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
		      (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
		       (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);

	return reciprocal_scale(hash, nf_ct_expect_hsize);
}

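/* Look up an expectation matching the given tuple in the given zone.
 * The caller must hold rcu_read_lock(); no reference is taken on the
 * returned expectation.
 */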
struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net, u16 zone,
		    const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i;
	unsigned int h;

	if (!net->ct.expect_count)
		return NULL;

	h = nf_ct_expect_dst_hash(tuple);
	hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
		    nf_ct_zone(i->master) == zone)
			return i;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(__nf_ct_expect_find);

/* Just find an expectation corresponding to a tuple. */
struct nf_conntrack_expect *
nf_ct_expect_find_get(struct net *net, u16 zone,
		      const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i;

	rcu_read_lock();
	i = __nf_ct_expect_find(net, zone, tuple);
	if (i && !atomic_inc_not_zero(&i->use))
		i = NULL;
	rcu_read_unlock();

	return i;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);

/* If an expectation for this connection is found, it is deleted from
 * the global list and then returned. */
struct nf_conntrack_expect *
nf_ct_find_expectation(struct net *net, u16 zone,
		       const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i, *exp = NULL;
	unsigned int h;

	if (!net->ct.expect_count)
		return NULL;

	h = nf_ct_expect_dst_hash(tuple);
	hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
		if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
		    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
		    nf_ct_zone(i->master) == zone) {
			exp = i;
			break;
		}
	}
	if (!exp)
		return NULL;

	/* If the master is not in the hash table yet (i.e. the packet
	   hasn't left this machine yet), how can the other end know
	   about the expected connection?  Hence these are not the
	   droids you are looking for (if the master ct never got
	   confirmed, we'd hold a reference to it and weird things
	   would happen to future packets). */
	if (!nf_ct_is_confirmed(exp->master))
		return NULL;

	/* Avoid a race with other CPUs that, for the exp->master ct,
	 * are about to invoke ->destroy(), or nf_ct_delete() via
	 * timeout or early_drop().
	 *
	 * If atomic_inc_not_zero() fails, we know that the ct is being
	 * destroyed.  If it succeeds, we can be sure the ct cannot
	 * disappear underneath us.
	 */
	if (unlikely(nf_ct_is_dying(exp->master) ||
		     !atomic_inc_not_zero(&exp->master->ct_general.use)))
		return NULL;

	if (exp->flags & NF_CT_EXPECT_PERMANENT) {
		atomic_inc(&exp->use);
		return exp;
	} else if (del_timer(&exp->timeout)) {
		nf_ct_unlink_expect(exp);
		return exp;
	}
	/* Undo exp->master refcnt increase, if del_timer() failed */
	nf_ct_put(exp->master);

	return NULL;
}

/* delete all expectations for this conntrack */
void nf_ct_remove_expectations(struct nf_conn *ct)
{
	struct nf_conn_help *help = nfct_help(ct);
	struct nf_conntrack_expect *exp;
	struct hlist_node *next;

	/* Optimization: most connections never expect any others. */
	if (!help)
		return;

	spin_lock_bh(&nf_conntrack_expect_lock);
	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
		if (del_timer(&exp->timeout)) {
			nf_ct_unlink_expect(exp);
			nf_ct_expect_put(exp);
		}
	}
	spin_unlock_bh(&nf_conntrack_expect_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);

/* Would two expected things clash? */
static inline int expect_clash(const struct nf_conntrack_expect *a,
			       const struct nf_conntrack_expect *b)
{
	/* Part covered by intersection of masks must be unequal,
	   otherwise they clash */
	struct nf_conntrack_tuple_mask intersect_mask;
	int count;

	intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;

	for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
		intersect_mask.src.u3.all[count] =
			a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
	}

	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
	       nf_ct_zone(a->master) == nf_ct_zone(b->master);
}

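/* Two expectations are identical iff they have the same master, class,
 * tuple, mask and zone; used to detect re-registration of an existing
 * expectation.
 */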
static inline int expect_matches(const struct nf_conntrack_expect *a,
				 const struct nf_conntrack_expect *b)
{
	return a->master == b->master && a->class == b->class &&
		nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
		nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
		nf_ct_zone(a->master) == nf_ct_zone(b->master);
}

/* Generally a bad idea to call this: could have matched already. */
void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
{
	spin_lock_bh(&nf_conntrack_expect_lock);
	if (del_timer(&exp->timeout)) {
		nf_ct_unlink_expect(exp);
		nf_ct_expect_put(exp);
	}
	spin_unlock_bh(&nf_conntrack_expect_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);

/* We don't increase the master conntrack refcount for non-fulfilled
 * expectations.  During conntrack destruction, the expectations are
 * always killed before the conntrack itself. */
struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
{
	struct nf_conntrack_expect *new;

	new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
	if (!new)
		return NULL;

	new->master = me;
	atomic_set(&new->use, 1);
	return new;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);

void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
		       u_int8_t family,
		       const union nf_inet_addr *saddr,
		       const union nf_inet_addr *daddr,
		       u_int8_t proto, const __be16 *src, const __be16 *dst)
{
	int len;

	if (family == AF_INET)
		len = 4;
	else
		len = 16;

	exp->flags = 0;
	exp->class = class;
	exp->expectfn = NULL;
	exp->helper = NULL;
	exp->tuple.src.l3num = family;
	exp->tuple.dst.protonum = proto;

	if (saddr) {
		memcpy(&exp->tuple.src.u3, saddr, len);
		if (sizeof(exp->tuple.src.u3) > len)
			/* address needs to be cleared for nf_ct_tuple_equal */
			memset((void *)&exp->tuple.src.u3 + len, 0x00,
			       sizeof(exp->tuple.src.u3) - len);
		memset(&exp->mask.src.u3, 0xFF, len);
		if (sizeof(exp->mask.src.u3) > len)
			memset((void *)&exp->mask.src.u3 + len, 0x00,
			       sizeof(exp->mask.src.u3) - len);
	} else {
		memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
		memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
	}

	if (src) {
		exp->tuple.src.u.all = *src;
		exp->mask.src.u.all = htons(0xFFFF);
	} else {
		exp->tuple.src.u.all = 0;
		exp->mask.src.u.all = 0;
	}

	memcpy(&exp->tuple.dst.u3, daddr, len);
	if (sizeof(exp->tuple.dst.u3) > len)
		/* address needs to be cleared for nf_ct_tuple_equal */
		memset((void *)&exp->tuple.dst.u3 + len, 0x00,
		       sizeof(exp->tuple.dst.u3) - len);

	exp->tuple.dst.u.all = *dst;

#ifdef CONFIG_NF_NAT_NEEDED
	memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
	memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_expect_init);
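
/* A minimal usage sketch, modelled on the in-tree helpers ("ct",
 * "port" and "ret" come from the surrounding helper context and are
 * assumed here, not taken from this file):
 *
 *	exp = nf_ct_expect_alloc(ct);
 *	if (exp == NULL)
 *		return NF_DROP;
 *	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
 *			  nf_ct_l3num(ct),
 *			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3,
 *			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3,
 *			  IPPROTO_TCP, NULL, &port);
 *	if (nf_ct_expect_related(exp) != 0)
 *		ret = NF_DROP;
 *	nf_ct_expect_put(exp);
 */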

static void nf_ct_expect_free_rcu(struct rcu_head *head)
{
	struct nf_conntrack_expect *exp;

	exp = container_of(head, struct nf_conntrack_expect, rcu);
	kmem_cache_free(nf_ct_expect_cachep, exp);
}

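/* Drop a reference; once the last reference is gone the expectation is
 * freed after an RCU grace period.
 */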
void nf_ct_expect_put(struct nf_conntrack_expect *exp)
{
	if (atomic_dec_and_test(&exp->use))
		call_rcu(&exp->rcu, nf_ct_expect_free_rcu);
}
EXPORT_SYMBOL_GPL(nf_ct_expect_put);

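/* Link the expectation into the per-master list and the global hash
 * table and start its timeout timer.  The caller must hold
 * nf_conntrack_expect_lock.
 */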
static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
	struct nf_conn_help *master_help = nfct_help(exp->master);
	struct nf_conntrack_helper *helper;
	struct net *net = nf_ct_exp_net(exp);
	unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);

	/* two references: one for the hash insert, one for the timer */
	atomic_add(2, &exp->use);

	hlist_add_head(&exp->lnode, &master_help->expectations);
	master_help->expecting[exp->class]++;

	hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
	net->ct.expect_count++;

	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
		    (unsigned long)exp);
	helper = rcu_dereference_protected(master_help->helper,
					   lockdep_is_held(&nf_conntrack_expect_lock));
	if (helper) {
		exp->timeout.expires = jiffies +
			helper->expect_policy[exp->class].timeout * HZ;
	}
	add_timer(&exp->timeout);

	NF_CT_STAT_INC(net, expect_create);
	return 0;
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct nf_conn *master,
				struct nf_conntrack_expect *new)
{
	struct nf_conn_help *master_help = nfct_help(master);
	struct nf_conntrack_expect *exp, *last = NULL;

	hlist_for_each_entry(exp, &master_help->expectations, lnode) {
		if (exp->class == new->class)
			last = exp;
	}

	if (last && del_timer(&last->timeout)) {
		nf_ct_unlink_expect(last);
		nf_ct_expect_put(last);
	}
}

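/* Check whether the expectation may be inserted: replace an identical
 * existing expectation, reject clashing ones and enforce the per-helper
 * and global limits.  Returns 1 if the insert may proceed, a negative
 * errno otherwise.  The caller must hold nf_conntrack_expect_lock.
 */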
static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
{
	const struct nf_conntrack_expect_policy *p;
	struct nf_conntrack_expect *i;
	struct nf_conn *master = expect->master;
	struct nf_conn_help *master_help = nfct_help(master);
	struct nf_conntrack_helper *helper;
	struct net *net = nf_ct_exp_net(expect);
	struct hlist_node *next;
	unsigned int h;
	int ret = 1;

	if (!master_help) {
		ret = -ESHUTDOWN;
		goto out;
	}
	h = nf_ct_expect_dst_hash(&expect->tuple);
	hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
		if (expect_matches(i, expect)) {
			if (del_timer(&i->timeout)) {
				nf_ct_unlink_expect(i);
				nf_ct_expect_put(i);
				break;
			}
		} else if (expect_clash(i, expect)) {
			ret = -EBUSY;
			goto out;
		}
	}
	/* Will be over limit? */
	helper = rcu_dereference_protected(master_help->helper,
					   lockdep_is_held(&nf_conntrack_expect_lock));
	if (helper) {
		p = &helper->expect_policy[expect->class];
		if (p->max_expected &&
		    master_help->expecting[expect->class] >= p->max_expected) {
			evict_oldest_expect(master, expect);
			if (master_help->expecting[expect->class]
						>= p->max_expected) {
				ret = -EMFILE;
				goto out;
			}
		}
	}

	if (net->ct.expect_count >= nf_ct_expect_max) {
		net_warn_ratelimited("nf_conntrack: expectation table full\n");
		ret = -EMFILE;
	}
out:
	return ret;
}

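/* Register an expectation and, on success, report the new expectation
 * to ctnetlink listeners on behalf of the given portid.
 */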
int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
				u32 portid, int report)
{
	int ret;

	spin_lock_bh(&nf_conntrack_expect_lock);
	ret = __nf_ct_expect_check(expect);
	if (ret <= 0)
		goto out;

	ret = nf_ct_expect_insert(expect);
	if (ret < 0)
		goto out;
	spin_unlock_bh(&nf_conntrack_expect_lock);
	nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
	return ret;
out:
	spin_unlock_bh(&nf_conntrack_expect_lock);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);

#ifdef CONFIG_NF_CONNTRACK_PROCFS
struct ct_expect_iter_state {
	struct seq_net_private p;
	unsigned int bucket;
};

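/* seq_file iteration helpers: walk the expectation hash table bucket by
 * bucket under rcu_read_lock().
 */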
static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
	struct net *net = seq_file_net(seq);
	struct ct_expect_iter_state *st = seq->private;
	struct hlist_node *n;

	for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
		n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
		if (n)
			return n;
	}
	return NULL;
}

static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
					     struct hlist_node *head)
{
	struct net *net = seq_file_net(seq);
	struct ct_expect_iter_state *st = seq->private;

	head = rcu_dereference(hlist_next_rcu(head));
	while (head == NULL) {
		if (++st->bucket >= nf_ct_expect_hsize)
			return NULL;
		head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
	}
	return head;
}

static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
{
	struct hlist_node *head = ct_expect_get_first(seq);

	if (head)
		while (pos && (head = ct_expect_get_next(seq, head)))
			pos--;
	return pos ? NULL : head;
}

static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return ct_expect_get_idx(seq, *pos);
}

static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;
	return ct_expect_get_next(seq, v);
}

static void exp_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

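/* Show one expectation: remaining timeout, protocols, tuple, flags and
 * the name of the helper (and expectation class) that created it.
 */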
static int exp_seq_show(struct seq_file *s, void *v)
{
	struct nf_conntrack_expect *expect;
	struct nf_conntrack_helper *helper;
	struct hlist_node *n = v;
	char *delim = "";

	expect = hlist_entry(n, struct nf_conntrack_expect, hnode);

	if (expect->timeout.function)
		seq_printf(s, "%ld ", timer_pending(&expect->timeout)
			   ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
	else
		seq_printf(s, "- ");
	seq_printf(s, "l3proto = %u proto=%u ",
		   expect->tuple.src.l3num,
		   expect->tuple.dst.protonum);
	print_tuple(s, &expect->tuple,
		    __nf_ct_l3proto_find(expect->tuple.src.l3num),
		    __nf_ct_l4proto_find(expect->tuple.src.l3num,
				       expect->tuple.dst.protonum));

	if (expect->flags & NF_CT_EXPECT_PERMANENT) {
		seq_printf(s, "PERMANENT");
		delim = ",";
	}
	if (expect->flags & NF_CT_EXPECT_INACTIVE) {
		seq_printf(s, "%sINACTIVE", delim);
		delim = ",";
	}
	if (expect->flags & NF_CT_EXPECT_USERSPACE)
		seq_printf(s, "%sUSERSPACE", delim);

	helper = rcu_dereference(nfct_help(expect->master)->helper);
	if (helper) {
		seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name);
		if (helper->expect_policy[expect->class].name)
			seq_printf(s, "/%s",
				   helper->expect_policy[expect->class].name);
	}

	seq_putc(s, '\n');

	return 0;
}

static const struct seq_operations exp_seq_ops = {
	.start = exp_seq_start,
	.next = exp_seq_next,
	.stop = exp_seq_stop,
	.show = exp_seq_show
};

static int exp_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &exp_seq_ops,
			sizeof(struct ct_expect_iter_state));
}

static const struct file_operations exp_file_ops = {
	.owner   = THIS_MODULE,
	.open    = exp_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
#endif /* CONFIG_NF_CONNTRACK_PROCFS */

static int exp_proc_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_PROCFS
	struct proc_dir_entry *proc;

	proc = proc_create("nf_conntrack_expect", 0440, net->proc_net,
			   &exp_file_ops);
	if (!proc)
		return -ENOMEM;
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
	return 0;
}

static void exp_proc_remove(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_PROCFS
	remove_proc_entry("nf_conntrack_expect", net->proc_net);
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
}

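/* The expectation hash size can only be set at boot or module load
 * time; the parameter is read-only (0400) at runtime.
 */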
module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);

int nf_conntrack_expect_pernet_init(struct net *net)
{
	int err = -ENOMEM;

	net->ct.expect_count = 0;
	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
	if (net->ct.expect_hash == NULL)
		goto err1;

	err = exp_proc_init(net);
	if (err < 0)
		goto err2;

	return 0;
err2:
	nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
err1:
	return err;
}

void nf_conntrack_expect_pernet_fini(struct net *net)
{
	exp_proc_remove(net);
	nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
}

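/* If no hash size was given on the command line, derive it from the
 * conntrack hash size; nf_ct_expect_max caps the table at an average
 * of four expectations per bucket.
 */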
int nf_conntrack_expect_init(void)
{
	if (!nf_ct_expect_hsize) {
		nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
		if (!nf_ct_expect_hsize)
			nf_ct_expect_hsize = 1;
	}
	nf_ct_expect_max = nf_ct_expect_hsize * 4;
	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
				sizeof(struct nf_conntrack_expect),
				0, 0, NULL);
	if (!nf_ct_expect_cachep)
		return -ENOMEM;
	return 0;
}

void nf_conntrack_expect_fini(void)
{
	rcu_barrier(); /* Wait for call_rcu() before destroy */
	kmem_cache_destroy(nf_ct_expect_cachep);
}