1/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU General Public License as
5 * published by the Free Software Foundation; either version 2 of
6 * the License, or (at your option) any later version.
7 *
8 */
9
10#include "ipvlan.h"
11
/* Seed handed to the jhash helpers in this file; populated by
 * ipvlan_init_secret().
 */
static u32 ipvlan_jhash_secret __read_mostly;
13
/* Fill ipvlan_jhash_secret with random bytes through
 * net_get_random_once().
 */
void ipvlan_init_secret(void)
{
	net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret));
}
18
19static void ipvlan_count_rx(const struct ipvl_dev *ipvlan,
20			    unsigned int len, bool success, bool mcast)
21{
22	if (!ipvlan)
23		return;
24
25	if (likely(success)) {
26		struct ipvl_pcpu_stats *pcptr;
27
28		pcptr = this_cpu_ptr(ipvlan->pcpu_stats);
29		u64_stats_update_begin(&pcptr->syncp);
30		pcptr->rx_pkts++;
31		pcptr->rx_bytes += len;
32		if (mcast)
33			pcptr->rx_mcast++;
34		u64_stats_update_end(&pcptr->syncp);
35	} else {
36		this_cpu_inc(ipvlan->pcpu_stats->rx_errs);
37	}
38}
39
40static u8 ipvlan_get_v6_hash(const void *iaddr)
41{
42	const struct in6_addr *ip6_addr = iaddr;
43
44	return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret) &
45	       IPVLAN_HASH_MASK;
46}
47
48static u8 ipvlan_get_v4_hash(const void *iaddr)
49{
50	const struct in_addr *ip4_addr = iaddr;
51
52	return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) &
53	       IPVLAN_HASH_MASK;
54}
55
56struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
57					const void *iaddr, bool is_v6)
58{
59	struct ipvl_addr *addr;
60	u8 hash;
61
62	hash = is_v6 ? ipvlan_get_v6_hash(iaddr) :
63	       ipvlan_get_v4_hash(iaddr);
64	hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) {
65		if (is_v6 && addr->atype == IPVL_IPV6 &&
66		    ipv6_addr_equal(&addr->ip6addr, iaddr))
67			return addr;
68		else if (!is_v6 && addr->atype == IPVL_IPV4 &&
69			 addr->ip4addr.s_addr ==
70				((struct in_addr *)iaddr)->s_addr)
71			return addr;
72	}
73	return NULL;
74}
75
76void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr)
77{
78	struct ipvl_port *port = ipvlan->port;
79	u8 hash;
80
81	hash = (addr->atype == IPVL_IPV6) ?
82	       ipvlan_get_v6_hash(&addr->ip6addr) :
83	       ipvlan_get_v4_hash(&addr->ip4addr);
84	if (hlist_unhashed(&addr->hlnode))
85		hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]);
86}
87
/* Unlink @addr from its hash chain with hlist_del_init_rcu(). When @sync
 * is true, also call synchronize_rcu() before returning.
 */
void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync)
{
	hlist_del_init_rcu(&addr->hlnode);
	if (sync)
		synchronize_rcu();
}
94
95struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
96				   const void *iaddr, bool is_v6)
97{
98	struct ipvl_addr *addr;
99
100	list_for_each_entry(addr, &ipvlan->addrs, anode) {
101		if ((is_v6 && addr->atype == IPVL_IPV6 &&
102		    ipv6_addr_equal(&addr->ip6addr, iaddr)) ||
103		    (!is_v6 && addr->atype == IPVL_IPV4 &&
104		    addr->ip4addr.s_addr == ((struct in_addr *)iaddr)->s_addr))
105			return addr;
106	}
107	return NULL;
108}
109
110bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6)
111{
112	struct ipvl_dev *ipvlan;
113
114	ASSERT_RTNL();
115
116	list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
117		if (ipvlan_find_addr(ipvlan, iaddr, is_v6))
118			return true;
119	}
120	return false;
121}
122
123static void *ipvlan_get_L3_hdr(struct sk_buff *skb, int *type)
124{
125	void *lyr3h = NULL;
126
127	switch (skb->protocol) {
128	case htons(ETH_P_ARP): {
129		struct arphdr *arph;
130
131		if (unlikely(!pskb_may_pull(skb, sizeof(*arph))))
132			return NULL;
133
134		arph = arp_hdr(skb);
135		*type = IPVL_ARP;
136		lyr3h = arph;
137		break;
138	}
139	case htons(ETH_P_IP): {
140		u32 pktlen;
141		struct iphdr *ip4h;
142
143		if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h))))
144			return NULL;
145
146		ip4h = ip_hdr(skb);
147		pktlen = ntohs(ip4h->tot_len);
148		if (ip4h->ihl < 5 || ip4h->version != 4)
149			return NULL;
150		if (skb->len < pktlen || pktlen < (ip4h->ihl * 4))
151			return NULL;
152
153		*type = IPVL_IPV4;
154		lyr3h = ip4h;
155		break;
156	}
157	case htons(ETH_P_IPV6): {
158		struct ipv6hdr *ip6h;
159
160		if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h))))
161			return NULL;
162
163		ip6h = ipv6_hdr(skb);
164		if (ip6h->version != 6)
165			return NULL;
166
167		*type = IPVL_IPV6;
168		lyr3h = ip6h;
169		/* Only Neighbour Solicitation pkts need different treatment */
170		if (ipv6_addr_any(&ip6h->saddr) &&
171		    ip6h->nexthdr == NEXTHDR_ICMP) {
172			*type = IPVL_ICMPV6;
173			lyr3h = ip6h + 1;
174		}
175		break;
176	}
177	default:
178		return NULL;
179	}
180
181	return lyr3h;
182}
183
184unsigned int ipvlan_mac_hash(const unsigned char *addr)
185{
186	u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2),
187			       ipvlan_jhash_secret);
188
189	return hash & IPVLAN_MAC_FILTER_MASK;
190}
191
192static void ipvlan_multicast_frame(struct ipvl_port *port, struct sk_buff *skb,
193				   const struct ipvl_dev *in_dev, bool local)
194{
195	struct ethhdr *eth = eth_hdr(skb);
196	struct ipvl_dev *ipvlan;
197	struct sk_buff *nskb;
198	unsigned int len;
199	unsigned int mac_hash;
200	int ret;
201
202	if (skb->protocol == htons(ETH_P_PAUSE))
203		return;
204
205	rcu_read_lock();
206	list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) {
207		if (local && (ipvlan == in_dev))
208			continue;
209
210		mac_hash = ipvlan_mac_hash(eth->h_dest);
211		if (!test_bit(mac_hash, ipvlan->mac_filters))
212			continue;
213
214		ret = NET_RX_DROP;
215		len = skb->len + ETH_HLEN;
216		nskb = skb_clone(skb, GFP_ATOMIC);
217		if (!nskb)
218			goto mcast_acct;
219
220		if (ether_addr_equal(eth->h_dest, ipvlan->phy_dev->broadcast))
221			nskb->pkt_type = PACKET_BROADCAST;
222		else
223			nskb->pkt_type = PACKET_MULTICAST;
224
225		nskb->dev = ipvlan->dev;
226		if (local)
227			ret = dev_forward_skb(ipvlan->dev, nskb);
228		else
229			ret = netif_rx(nskb);
230mcast_acct:
231		ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
232	}
233	rcu_read_unlock();
234
235	/* Locally generated? ...Forward a copy to the main-device as
236	 * well. On the RX side we'll ignore it (wont give it to any
237	 * of the virtual devices.
238	 */
239	if (local) {
240		nskb = skb_clone(skb, GFP_ATOMIC);
241		if (nskb) {
242			if (ether_addr_equal(eth->h_dest, port->dev->broadcast))
243				nskb->pkt_type = PACKET_BROADCAST;
244			else
245				nskb->pkt_type = PACKET_MULTICAST;
246
247			dev_forward_skb(port->dev, nskb);
248		}
249	}
250}
251
252static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff *skb,
253			    bool local)
254{
255	struct ipvl_dev *ipvlan = addr->master;
256	struct net_device *dev = ipvlan->dev;
257	unsigned int len;
258	rx_handler_result_t ret = RX_HANDLER_CONSUMED;
259	bool success = false;
260
261	len = skb->len + ETH_HLEN;
262	if (unlikely(!(dev->flags & IFF_UP))) {
263		kfree_skb(skb);
264		goto out;
265	}
266
267	skb = skb_share_check(skb, GFP_ATOMIC);
268	if (!skb)
269		goto out;
270
271	skb->dev = dev;
272	skb->pkt_type = PACKET_HOST;
273
274	if (local) {
275		if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
276			success = true;
277	} else {
278		ret = RX_HANDLER_ANOTHER;
279		success = true;
280	}
281
282out:
283	ipvlan_count_rx(ipvlan, len, success, false);
284	return ret;
285}
286
287static struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port,
288					    void *lyr3h, int addr_type,
289					    bool use_dest)
290{
291	struct ipvl_addr *addr = NULL;
292
293	if (addr_type == IPVL_IPV6) {
294		struct ipv6hdr *ip6h;
295		struct in6_addr *i6addr;
296
297		ip6h = (struct ipv6hdr *)lyr3h;
298		i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr;
299		addr = ipvlan_ht_addr_lookup(port, i6addr, true);
300	} else if (addr_type == IPVL_ICMPV6) {
301		struct nd_msg *ndmh;
302		struct in6_addr *i6addr;
303
304		/* Make sure that the NeighborSolicitation ICMPv6 packets
305		 * are handled to avoid DAD issue.
306		 */
307		ndmh = (struct nd_msg *)lyr3h;
308		if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) {
309			i6addr = &ndmh->target;
310			addr = ipvlan_ht_addr_lookup(port, i6addr, true);
311		}
312	} else if (addr_type == IPVL_IPV4) {
313		struct iphdr *ip4h;
314		__be32 *i4addr;
315
316		ip4h = (struct iphdr *)lyr3h;
317		i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr;
318		addr = ipvlan_ht_addr_lookup(port, i4addr, false);
319	} else if (addr_type == IPVL_ARP) {
320		struct arphdr *arph;
321		unsigned char *arp_ptr;
322		__be32 dip;
323
324		arph = (struct arphdr *)lyr3h;
325		arp_ptr = (unsigned char *)(arph + 1);
326		if (use_dest)
327			arp_ptr += (2 * port->dev->addr_len) + 4;
328		else
329			arp_ptr += port->dev->addr_len;
330
331		memcpy(&dip, arp_ptr, 4);
332		addr = ipvlan_ht_addr_lookup(port, &dip, false);
333	}
334
335	return addr;
336}
337
338static int ipvlan_process_v4_outbound(struct sk_buff *skb)
339{
340	const struct iphdr *ip4h = ip_hdr(skb);
341	struct net_device *dev = skb->dev;
342	struct rtable *rt;
343	int err, ret = NET_XMIT_DROP;
344	struct flowi4 fl4 = {
345		.flowi4_oif = dev_get_iflink(dev),
346		.flowi4_tos = RT_TOS(ip4h->tos),
347		.flowi4_flags = FLOWI_FLAG_ANYSRC,
348		.daddr = ip4h->daddr,
349		.saddr = ip4h->saddr,
350	};
351
352	rt = ip_route_output_flow(dev_net(dev), &fl4, NULL);
353	if (IS_ERR(rt))
354		goto err;
355
356	if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
357		ip_rt_put(rt);
358		goto err;
359	}
360	skb_dst_drop(skb);
361	skb_dst_set(skb, &rt->dst);
362	err = ip_local_out(skb);
363	if (unlikely(net_xmit_eval(err)))
364		dev->stats.tx_errors++;
365	else
366		ret = NET_XMIT_SUCCESS;
367	goto out;
368err:
369	dev->stats.tx_errors++;
370	kfree_skb(skb);
371out:
372	return ret;
373}
374
375static int ipvlan_process_v6_outbound(struct sk_buff *skb)
376{
377	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
378	struct net_device *dev = skb->dev;
379	struct dst_entry *dst;
380	int err, ret = NET_XMIT_DROP;
381	struct flowi6 fl6 = {
382		.flowi6_iif = skb->dev->ifindex,
383		.daddr = ip6h->daddr,
384		.saddr = ip6h->saddr,
385		.flowi6_flags = FLOWI_FLAG_ANYSRC,
386		.flowlabel = ip6_flowinfo(ip6h),
387		.flowi6_mark = skb->mark,
388		.flowi6_proto = ip6h->nexthdr,
389	};
390
391	dst = ip6_route_output(dev_net(dev), NULL, &fl6);
392	if (dst->error) {
393		ret = dst->error;
394		dst_release(dst);
395		goto err;
396	}
397	skb_dst_drop(skb);
398	skb_dst_set(skb, dst);
399	err = ip6_local_out(skb);
400	if (unlikely(net_xmit_eval(err)))
401		dev->stats.tx_errors++;
402	else
403		ret = NET_XMIT_SUCCESS;
404	goto out;
405err:
406	dev->stats.tx_errors++;
407	kfree_skb(skb);
408out:
409	return ret;
410}
411
412static int ipvlan_process_outbound(struct sk_buff *skb,
413				   const struct ipvl_dev *ipvlan)
414{
415	struct ethhdr *ethh = eth_hdr(skb);
416	int ret = NET_XMIT_DROP;
417
418	/* In this mode we dont care about multicast and broadcast traffic */
419	if (is_multicast_ether_addr(ethh->h_dest)) {
420		pr_warn_ratelimited("Dropped {multi|broad}cast of type= [%x]\n",
421				    ntohs(skb->protocol));
422		kfree_skb(skb);
423		goto out;
424	}
425
426	/* The ipvlan is a pseudo-L2 device, so the packets that we receive
427	 * will have L2; which need to discarded and processed further
428	 * in the net-ns of the main-device.
429	 */
430	if (skb_mac_header_was_set(skb)) {
431		skb_pull(skb, sizeof(*ethh));
432		skb->mac_header = (typeof(skb->mac_header))~0U;
433		skb_reset_network_header(skb);
434	}
435
436	if (skb->protocol == htons(ETH_P_IPV6))
437		ret = ipvlan_process_v6_outbound(skb);
438	else if (skb->protocol == htons(ETH_P_IP))
439		ret = ipvlan_process_v4_outbound(skb);
440	else {
441		pr_warn_ratelimited("Dropped outbound packet type=%x\n",
442				    ntohs(skb->protocol));
443		kfree_skb(skb);
444	}
445out:
446	return ret;
447}
448
449static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
450{
451	const struct ipvl_dev *ipvlan = netdev_priv(dev);
452	void *lyr3h;
453	struct ipvl_addr *addr;
454	int addr_type;
455
456	lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
457	if (!lyr3h)
458		goto out;
459
460	addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
461	if (addr)
462		return ipvlan_rcv_frame(addr, skb, true);
463
464out:
465	skb->dev = ipvlan->phy_dev;
466	return ipvlan_process_outbound(skb, ipvlan);
467}
468
469static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
470{
471	const struct ipvl_dev *ipvlan = netdev_priv(dev);
472	struct ethhdr *eth = eth_hdr(skb);
473	struct ipvl_addr *addr;
474	void *lyr3h;
475	int addr_type;
476
477	if (ether_addr_equal(eth->h_dest, eth->h_source)) {
478		lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
479		if (lyr3h) {
480			addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
481			if (addr)
482				return ipvlan_rcv_frame(addr, skb, true);
483		}
484		skb = skb_share_check(skb, GFP_ATOMIC);
485		if (!skb)
486			return NET_XMIT_DROP;
487
488		/* Packet definitely does not belong to any of the
489		 * virtual devices, but the dest is local. So forward
490		 * the skb for the main-dev. At the RX side we just return
491		 * RX_PASS for it to be processed further on the stack.
492		 */
493		return dev_forward_skb(ipvlan->phy_dev, skb);
494
495	} else if (is_multicast_ether_addr(eth->h_dest)) {
496		u8 ip_summed = skb->ip_summed;
497
498		skb->ip_summed = CHECKSUM_UNNECESSARY;
499		ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true);
500		skb->ip_summed = ip_summed;
501	}
502
503	skb->dev = ipvlan->phy_dev;
504	return dev_queue_xmit(skb);
505}
506
507int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
508{
509	struct ipvl_dev *ipvlan = netdev_priv(dev);
510	struct ipvl_port *port = ipvlan_port_get_rcu(ipvlan->phy_dev);
511
512	if (!port)
513		goto out;
514
515	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
516		goto out;
517
518	switch(port->mode) {
519	case IPVLAN_MODE_L2:
520		return ipvlan_xmit_mode_l2(skb, dev);
521	case IPVLAN_MODE_L3:
522		return ipvlan_xmit_mode_l3(skb, dev);
523	}
524
525	/* Should not reach here */
526	WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n",
527			  port->mode);
528out:
529	kfree_skb(skb);
530	return NET_XMIT_DROP;
531}
532
533static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port)
534{
535	struct ethhdr *eth = eth_hdr(skb);
536	struct ipvl_addr *addr;
537	void *lyr3h;
538	int addr_type;
539
540	if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) {
541		lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
542		if (!lyr3h)
543			return true;
544
545		addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false);
546		if (addr)
547			return false;
548	}
549
550	return true;
551}
552
553static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb,
554						 struct ipvl_port *port)
555{
556	void *lyr3h;
557	int addr_type;
558	struct ipvl_addr *addr;
559	struct sk_buff *skb = *pskb;
560	rx_handler_result_t ret = RX_HANDLER_PASS;
561
562	lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
563	if (!lyr3h)
564		goto out;
565
566	addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
567	if (addr)
568		ret = ipvlan_rcv_frame(addr, skb, false);
569
570out:
571	return ret;
572}
573
574static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
575						 struct ipvl_port *port)
576{
577	struct sk_buff *skb = *pskb;
578	struct ethhdr *eth = eth_hdr(skb);
579	rx_handler_result_t ret = RX_HANDLER_PASS;
580	void *lyr3h;
581	int addr_type;
582
583	if (is_multicast_ether_addr(eth->h_dest)) {
584		if (ipvlan_external_frame(skb, port))
585			ipvlan_multicast_frame(port, skb, NULL, false);
586	} else {
587		struct ipvl_addr *addr;
588
589		lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
590		if (!lyr3h)
591			return ret;
592
593		addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
594		if (addr)
595			ret = ipvlan_rcv_frame(addr, skb, false);
596	}
597
598	return ret;
599}
600
601rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
602{
603	struct sk_buff *skb = *pskb;
604	struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev);
605
606	if (!port)
607		return RX_HANDLER_PASS;
608
609	switch (port->mode) {
610	case IPVLAN_MODE_L2:
611		return ipvlan_handle_mode_l2(pskb, port);
612	case IPVLAN_MODE_L3:
613		return ipvlan_handle_mode_l3(pskb, port);
614	}
615
616	/* Should not reach here */
617	WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
618			  port->mode);
619	kfree_skb(skb);
620	return NET_RX_DROP;
621}
622