This source file includes the following definitions; a brief user-space usage sketch follows the list.
- xsk_is_setup_for_bpf_map
- xsk_umem_has_addrs
- xsk_umem_peek_addr
- xsk_umem_discard_addr
- xsk_set_rx_need_wakeup
- xsk_set_tx_need_wakeup
- xsk_clear_rx_need_wakeup
- xsk_clear_tx_need_wakeup
- xsk_umem_uses_need_wakeup
- __xsk_rcv_memcpy
- __xsk_rcv
- __xsk_rcv_zc
- xsk_is_bound
- xsk_rcv
- xsk_flush
- xsk_generic_rcv
- xsk_umem_complete_tx
- xsk_umem_consume_tx_done
- xsk_umem_consume_tx
- xsk_wakeup
- xsk_zc_xmit
- xsk_destruct_skb
- xsk_generic_xmit
- __xsk_sendmsg
- xsk_sendmsg
- xsk_poll
- xsk_init_queue
- xsk_unbind_dev
- xsk_get_map_list_entry
- xsk_delete_from_maps
- xsk_release
- xsk_lookup_xsk_from_fd
- xsk_check_page_contiguity
- xsk_bind
- xsk_setsockopt
- xsk_enter_rxtx_offsets
- xsk_enter_umem_offsets
- xsk_getsockopt
- xsk_mmap
- xsk_notifier
- xsk_destruct
- xsk_create
- xsk_net_init
- xsk_net_exit
- xsk_init
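
For orientation, the control-plane flow that these handlers implement (XDP_UMEM_REG and the ring setsockopts in xsk_setsockopt, XDP_MMAP_OFFSETS in xsk_getsockopt, xsk_mmap and xsk_bind) is driven from user space roughly as in the sketch below. This is a rough, untested illustration, not the file's own code: the interface index (3), chunk size, ring sizes and the lack of error handling are placeholders, the process needs CAP_NET_RAW, and AF_XDP/SOL_XDP require a sufficiently recent toolchain. A real application would normally use libbpf's xsk helpers instead.

#define _GNU_SOURCE
#include <linux/if_xdp.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    int entries = 1024;                     /* ring size, must be a power of two */
    __u64 umem_len = 4096ULL * 4096;        /* 4096 chunks of 4 KiB */
    void *area = NULL;
    struct xdp_mmap_offsets off;
    socklen_t optlen = sizeof(off);
    int fd = socket(AF_XDP, SOCK_RAW, 0);   /* handled by xsk_create() */

    /* Register the umem (XDP_UMEM_REG in xsk_setsockopt()). */
    posix_memalign(&area, getpagesize(), umem_len);
    struct xdp_umem_reg mr = {
        .addr = (__u64)(unsigned long)area,
        .len = umem_len,
        .chunk_size = 4096,
        .headroom = 0,
    };
    setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));

    /* Create fill, completion and RX rings; bind() below requires the
     * fill and completion rings to exist. */
    setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
    setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries, sizeof(entries));
    setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));

    /* Map the rings while the socket is still XSK_READY (see xsk_mmap()). */
    getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
    void *fill = mmap(NULL, off.fr.desc + entries * sizeof(__u64),
                      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                      fd, XDP_UMEM_PGOFF_FILL_RING);
    void *rx = mmap(NULL, off.rx.desc + entries * sizeof(struct xdp_desc),
                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                    fd, XDP_PGOFF_RX_RING);

    /* Attach to one queue of a device (xsk_bind()); placeholder ifindex. */
    struct sockaddr_xdp sxdp = {
        .sxdp_family = AF_XDP,
        .sxdp_ifindex = 3,
        .sxdp_queue_id = 0,
        .sxdp_flags = XDP_COPY,
    };
    bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));

    (void)fill; (void)rx;
    return 0;
}
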
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock.h>
26 #include <net/xdp.h>
27
28 #include "xsk_queue.h"
29 #include "xdp_umem.h"
30 #include "xsk.h"
31
32 #define TX_BATCH_SIZE 16
33
34 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
35 {
36 return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
37 READ_ONCE(xs->umem->fq);
38 }
39
40 bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
41 {
42 return xskq_has_addrs(umem->fq, cnt);
43 }
44 EXPORT_SYMBOL(xsk_umem_has_addrs);
45
46 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
47 {
48 return xskq_peek_addr(umem->fq, addr, umem);
49 }
50 EXPORT_SYMBOL(xsk_umem_peek_addr);
51
52 void xsk_umem_discard_addr(struct xdp_umem *umem)
53 {
54 xskq_discard_addr(umem->fq);
55 }
56 EXPORT_SYMBOL(xsk_umem_discard_addr);
57
58 void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
59 {
60 if (umem->need_wakeup & XDP_WAKEUP_RX)
61 return;
62
63 umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
64 umem->need_wakeup |= XDP_WAKEUP_RX;
65 }
66 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
67
68 void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
69 {
70 struct xdp_sock *xs;
71
72 if (umem->need_wakeup & XDP_WAKEUP_TX)
73 return;
74
75 rcu_read_lock();
76 list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
77 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
78 }
79 rcu_read_unlock();
80
81 umem->need_wakeup |= XDP_WAKEUP_TX;
82 }
83 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
84
85 void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
86 {
87 if (!(umem->need_wakeup & XDP_WAKEUP_RX))
88 return;
89
90 umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
91 umem->need_wakeup &= ~XDP_WAKEUP_RX;
92 }
93 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
94
95 void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
96 {
97 struct xdp_sock *xs;
98
99 if (!(umem->need_wakeup & XDP_WAKEUP_TX))
100 return;
101
102 rcu_read_lock();
103 list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
104 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
105 }
106 rcu_read_unlock();
107
108 umem->need_wakeup &= ~XDP_WAKEUP_TX;
109 }
110 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
111
112 bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
113 {
114 return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
115 }
116 EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
117
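/* Copy a received frame into the umem. If the destination straddles
 * two non-contiguous pages, the copy is split into two memcpy()s.
 * This path is only used in copy mode.
 */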
121 static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
122 u32 len, u32 metalen)
123 {
124 void *to_buf = xdp_umem_get_data(umem, addr);
125
126 addr = xsk_umem_add_offset_to_addr(addr);
127 if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
128 void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
129 u64 page_start = addr & ~(PAGE_SIZE - 1);
130 u64 first_len = PAGE_SIZE - (addr - page_start);
131
132 memcpy(to_buf, from_buf, first_len);
133 memcpy(next_pg_addr, from_buf + first_len,
134 len + metalen - first_len);
135
136 return;
137 }
138
139 memcpy(to_buf, from_buf, len + metalen);
140 }
141
142 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
143 {
144 u64 offset = xs->umem->headroom;
145 u64 addr, memcpy_addr;
146 void *from_buf;
147 u32 metalen;
148 int err;
149
150 if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
151 len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
152 xs->rx_dropped++;
153 return -ENOSPC;
154 }
155
156 if (unlikely(xdp_data_meta_unsupported(xdp))) {
157 from_buf = xdp->data;
158 metalen = 0;
159 } else {
160 from_buf = xdp->data_meta;
161 metalen = xdp->data - xdp->data_meta;
162 }
163
164 memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
165 __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
166
167 offset += metalen;
168 addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
169 err = xskq_produce_batch_desc(xs->rx, addr, len);
170 if (!err) {
171 xskq_discard_addr(xs->umem->fq);
172 xdp_return_buff(xdp);
173 return 0;
174 }
175
176 xs->rx_dropped++;
177 return err;
178 }
179
180 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
181 {
182 int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
183
184 if (err)
185 xs->rx_dropped++;
186
187 return err;
188 }
189
190 static bool xsk_is_bound(struct xdp_sock *xs)
191 {
192 if (READ_ONCE(xs->state) == XSK_BOUND) {
193
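/* Matches the smp_wmb() in xsk_bind(). */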
194 smp_rmb();
195 return true;
196 }
197 return false;
198 }
199
200 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
201 {
202 u32 len;
203
204 if (!xsk_is_bound(xs))
205 return -EINVAL;
206
207 if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
208 return -EINVAL;
209
210 len = xdp->data_end - xdp->data;
211
212 return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
213 __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
214 }
215
216 void xsk_flush(struct xdp_sock *xs)
217 {
218 xskq_produce_flush_desc(xs->rx);
219 xs->sk.sk_data_ready(&xs->sk);
220 }
221
222 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
223 {
224 u32 metalen = xdp->data - xdp->data_meta;
225 u32 len = xdp->data_end - xdp->data;
226 u64 offset = xs->umem->headroom;
227 void *buffer;
228 u64 addr;
229 int err;
230
231 spin_lock_bh(&xs->rx_lock);
232
233 if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
234 err = -EINVAL;
235 goto out_unlock;
236 }
237
238 if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
239 len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
240 err = -ENOSPC;
241 goto out_drop;
242 }
243
244 addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
245 buffer = xdp_umem_get_data(xs->umem, addr);
246 memcpy(buffer, xdp->data_meta, len + metalen);
247
248 addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
249 err = xskq_produce_batch_desc(xs->rx, addr, len);
250 if (err)
251 goto out_drop;
252
253 xskq_discard_addr(xs->umem->fq);
254 xskq_produce_flush_desc(xs->rx);
255
256 spin_unlock_bh(&xs->rx_lock);
257
258 xs->sk.sk_data_ready(&xs->sk);
259 return 0;
260
261 out_drop:
262 xs->rx_dropped++;
263 out_unlock:
264 spin_unlock_bh(&xs->rx_lock);
265 return err;
266 }
267
268 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
269 {
270 xskq_produce_flush_addr_n(umem->cq, nb_entries);
271 }
272 EXPORT_SYMBOL(xsk_umem_complete_tx);
273
274 void xsk_umem_consume_tx_done(struct xdp_umem *umem)
275 {
276 struct xdp_sock *xs;
277
278 rcu_read_lock();
279 list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
280 xs->sk.sk_write_space(&xs->sk);
281 }
282 rcu_read_unlock();
283 }
284 EXPORT_SYMBOL(xsk_umem_consume_tx_done);
285
286 bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
287 {
288 struct xdp_sock *xs;
289
290 rcu_read_lock();
291 list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
292 if (!xskq_peek_desc(xs->tx, desc, umem))
293 continue;
294
295 if (xskq_produce_addr_lazy(umem->cq, desc->addr))
296 goto out;
297
298 xskq_discard_desc(xs->tx);
299 rcu_read_unlock();
300 return true;
301 }
302
303 out:
304 rcu_read_unlock();
305 return false;
306 }
307 EXPORT_SYMBOL(xsk_umem_consume_tx);
308
309 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
310 {
311 struct net_device *dev = xs->dev;
312 int err;
313
314 rcu_read_lock();
315 err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
316 rcu_read_unlock();
317
318 return err;
319 }
320
321 static int xsk_zc_xmit(struct xdp_sock *xs)
322 {
323 return xsk_wakeup(xs, XDP_WAKEUP_TX);
324 }
325
326 static void xsk_destruct_skb(struct sk_buff *skb)
327 {
328 u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
329 struct xdp_sock *xs = xdp_sk(skb->sk);
330 unsigned long flags;
331
332 spin_lock_irqsave(&xs->tx_completion_lock, flags);
333 WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
334 spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
335
336 sock_wfree(skb);
337 }
338
339 static int xsk_generic_xmit(struct sock *sk)
340 {
341 struct xdp_sock *xs = xdp_sk(sk);
342 u32 max_batch = TX_BATCH_SIZE;
343 bool sent_frame = false;
344 struct xdp_desc desc;
345 struct sk_buff *skb;
346 int err = 0;
347
348 mutex_lock(&xs->mutex);
349
350 if (xs->queue_id >= xs->dev->real_num_tx_queues)
351 goto out;
352
353 while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
354 char *buffer;
355 u64 addr;
356 u32 len;
357
358 if (max_batch-- == 0) {
359 err = -EAGAIN;
360 goto out;
361 }
362
363 len = desc.len;
364 skb = sock_alloc_send_skb(sk, len, 1, &err);
365 if (unlikely(!skb)) {
366 err = -EAGAIN;
367 goto out;
368 }
369
370 skb_put(skb, len);
371 addr = desc.addr;
372 buffer = xdp_umem_get_data(xs->umem, addr);
373 err = skb_store_bits(skb, 0, buffer, len);
374 if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
375 kfree_skb(skb);
376 goto out;
377 }
378
379 skb->dev = xs->dev;
380 skb->priority = sk->sk_priority;
381 skb->mark = sk->sk_mark;
382 skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
383 skb->destructor = xsk_destruct_skb;
384
385 err = dev_direct_xmit(skb, xs->queue_id);
386 xskq_discard_desc(xs->tx);
387
388 if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
389
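/* The skb has already been consumed by dev_direct_xmit(); just
 * report the error and stop.
 */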
390 err = -EBUSY;
391 goto out;
392 }
393
394 sent_frame = true;
395 }
396
397 out:
398 if (sent_frame)
399 sk->sk_write_space(sk);
400
401 mutex_unlock(&xs->mutex);
402 return err;
403 }
404
405 static int __xsk_sendmsg(struct sock *sk)
406 {
407 struct xdp_sock *xs = xdp_sk(sk);
408
409 if (unlikely(!(xs->dev->flags & IFF_UP)))
410 return -ENETDOWN;
411 if (unlikely(!xs->tx))
412 return -ENOBUFS;
413
414 return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
415 }
416
417 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
418 {
419 bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
420 struct sock *sk = sock->sk;
421 struct xdp_sock *xs = xdp_sk(sk);
422
423 if (unlikely(!xsk_is_bound(xs)))
424 return -ENXIO;
425 if (unlikely(need_wait))
426 return -EOPNOTSUPP;
427
428 return __xsk_sendmsg(sk);
429 }
430
431 static unsigned int xsk_poll(struct file *file, struct socket *sock,
432 struct poll_table_struct *wait)
433 {
434 unsigned int mask = datagram_poll(file, sock, wait);
435 struct sock *sk = sock->sk;
436 struct xdp_sock *xs = xdp_sk(sk);
437 struct xdp_umem *umem;
438
439 if (unlikely(!xsk_is_bound(xs)))
440 return mask;
441
442 umem = xs->umem;
443
444 if (umem->need_wakeup) {
445 if (xs->zc)
446 xsk_wakeup(xs, umem->need_wakeup);
447 else
448
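/* Poll has to drive the Tx path as well in copy mode. */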
449 __xsk_sendmsg(sk);
450 }
451
452 if (xs->rx && !xskq_empty_desc(xs->rx))
453 mask |= POLLIN | POLLRDNORM;
454 if (xs->tx && !xskq_full_desc(xs->tx))
455 mask |= POLLOUT | POLLWRNORM;
456
457 return mask;
458 }
459
460 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
461 bool umem_queue)
462 {
463 struct xsk_queue *q;
464
465 if (entries == 0 || *queue || !is_power_of_2(entries))
466 return -EINVAL;
467
468 q = xskq_create(entries, umem_queue);
469 if (!q)
470 return -ENOMEM;
471
472
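/* Make sure the queue is fully initialized before it is made
 * visible to other CPUs via WRITE_ONCE() below.
 */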
473 smp_wmb();
474 WRITE_ONCE(*queue, q);
475 return 0;
476 }
477
478 static void xsk_unbind_dev(struct xdp_sock *xs)
479 {
480 struct net_device *dev = xs->dev;
481
482 if (xs->state != XSK_BOUND)
483 return;
484 WRITE_ONCE(xs->state, XSK_UNBOUND);
485
486
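/* Remove the socket from the umem's list and wait for the datapath
 * to finish with it before dropping the device reference.
 */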
487 xdp_del_sk_umem(xs->umem, xs);
488 xs->dev = NULL;
489 synchronize_net();
490 dev_put(dev);
491 }
492
493 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
494 struct xdp_sock ***map_entry)
495 {
496 struct xsk_map *map = NULL;
497 struct xsk_map_node *node;
498
499 *map_entry = NULL;
500
501 spin_lock_bh(&xs->map_list_lock);
502 node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
503 node);
504 if (node) {
505 WARN_ON(xsk_map_inc(node->map));
506 map = node->map;
507 *map_entry = node->map_entry;
508 }
509 spin_unlock_bh(&xs->map_list_lock);
510 return map;
511 }
512
513 static void xsk_delete_from_maps(struct xdp_sock *xs)
514 {
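/* This socket may have been inserted into several XSKMAPs. Walk its
 * map_list and clear every map entry it still occupies. Map updates
 * and map destruction can run concurrently, so a reference is taken
 * on each map before its entry is touched.
 */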
530 struct xdp_sock **map_entry = NULL;
531 struct xsk_map *map;
532
533 while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
534 xsk_map_try_sock_delete(map, xs, map_entry);
535 xsk_map_put(map);
536 }
537 }
538
539 static int xsk_release(struct socket *sock)
540 {
541 struct sock *sk = sock->sk;
542 struct xdp_sock *xs = xdp_sk(sk);
543 struct net *net;
544
545 if (!sk)
546 return 0;
547
548 net = sock_net(sk);
549
550 mutex_lock(&net->xdp.lock);
551 sk_del_node_init_rcu(sk);
552 mutex_unlock(&net->xdp.lock);
553
554 local_bh_disable();
555 sock_prot_inuse_add(net, sk->sk_prot, -1);
556 local_bh_enable();
557
558 xsk_delete_from_maps(xs);
559 mutex_lock(&xs->mutex);
560 xsk_unbind_dev(xs);
561 mutex_unlock(&xs->mutex);
562
563 xskq_destroy(xs->rx);
564 xskq_destroy(xs->tx);
565
566 sock_orphan(sk);
567 sock->sk = NULL;
568
569 sk_refcnt_debug_release(sk);
570 sock_put(sk);
571
572 return 0;
573 }
574
575 static struct socket *xsk_lookup_xsk_from_fd(int fd)
576 {
577 struct socket *sock;
578 int err;
579
580 sock = sockfd_lookup(fd, &err);
581 if (!sock)
582 return ERR_PTR(-ENOTSOCK);
583
584 if (sock->sk->sk_family != PF_XDP) {
585 sockfd_put(sock);
586 return ERR_PTR(-ENOPROTOOPT);
587 }
588
589 return sock;
590 }
591
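/* Record, for every umem page, whether the next page is contiguous,
 * encoded as a flag in the page's addr field. Zero-copy mode compares
 * DMA addresses, all other modes compare kernel virtual addresses.
 */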
597 static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
598 {
599 struct xdp_umem_page *pgs = umem->pages;
600 int i, is_contig;
601
602 for (i = 0; i < umem->npgs - 1; i++) {
603 is_contig = (flags & XDP_ZEROCOPY) ?
604 (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
605 (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
606 pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
607 }
608 }
609
610 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
611 {
612 struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
613 struct sock *sk = sock->sk;
614 struct xdp_sock *xs = xdp_sk(sk);
615 struct net_device *dev;
616 u32 flags, qid;
617 int err = 0;
618
619 if (addr_len < sizeof(struct sockaddr_xdp))
620 return -EINVAL;
621 if (sxdp->sxdp_family != AF_XDP)
622 return -EINVAL;
623
624 flags = sxdp->sxdp_flags;
625 if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
626 XDP_USE_NEED_WAKEUP))
627 return -EINVAL;
628
629 rtnl_lock();
630 mutex_lock(&xs->mutex);
631 if (xs->state != XSK_READY) {
632 err = -EBUSY;
633 goto out_release;
634 }
635
636 dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
637 if (!dev) {
638 err = -ENODEV;
639 goto out_release;
640 }
641
642 if (!xs->rx && !xs->tx) {
643 err = -EINVAL;
644 goto out_unlock;
645 }
646
647 qid = sxdp->sxdp_queue_id;
648
649 if (flags & XDP_SHARED_UMEM) {
650 struct xdp_sock *umem_xs;
651 struct socket *sock;
652
653 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
654 (flags & XDP_USE_NEED_WAKEUP)) {
655
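/* Flags cannot be specified together with XDP_SHARED_UMEM. */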
656 err = -EINVAL;
657 goto out_unlock;
658 }
659
660 if (xs->umem) {
661
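/* This socket already has its own umem registered. */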
662 err = -EINVAL;
663 goto out_unlock;
664 }
665
666 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
667 if (IS_ERR(sock)) {
668 err = PTR_ERR(sock);
669 goto out_unlock;
670 }
671
672 umem_xs = xdp_sk(sock->sk);
673 if (!xsk_is_bound(umem_xs)) {
674 err = -EBADF;
675 sockfd_put(sock);
676 goto out_unlock;
677 }
678 if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
679 err = -EINVAL;
680 sockfd_put(sock);
681 goto out_unlock;
682 }
683
684 xdp_get_umem(umem_xs->umem);
685 WRITE_ONCE(xs->umem, umem_xs->umem);
686 sockfd_put(sock);
687 } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
688 err = -EINVAL;
689 goto out_unlock;
690 } else {
691
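/* This socket uses its own umem: configure the fill and completion
 * rings and assign the umem to the device and queue.
 */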
692 xskq_set_umem(xs->umem->fq, xs->umem->size,
693 xs->umem->chunk_mask);
694 xskq_set_umem(xs->umem->cq, xs->umem->size,
695 xs->umem->chunk_mask);
696
697 err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
698 if (err)
699 goto out_unlock;
700
701 xsk_check_page_contiguity(xs->umem, flags);
702 }
703
704 xs->dev = dev;
705 xs->zc = xs->umem->zc;
706 xs->queue_id = qid;
707 xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
708 xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
709 xdp_add_sk_umem(xs->umem, xs);
710
711 out_unlock:
712 if (err) {
713 dev_put(dev);
714 } else {
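/* Matches the smp_rmb() in xsk_is_bound(); publish XSK_BOUND only
 * after the socket state written above is visible.
 */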
718 smp_wmb();
719 WRITE_ONCE(xs->state, XSK_BOUND);
720 }
721 out_release:
722 mutex_unlock(&xs->mutex);
723 rtnl_unlock();
724 return err;
725 }
726
727 struct xdp_umem_reg_v1 {
728 __u64 addr;
729 __u64 len;
730 __u32 chunk_size;
731 __u32 headroom;
732 };
733
734 static int xsk_setsockopt(struct socket *sock, int level, int optname,
735 char __user *optval, unsigned int optlen)
736 {
737 struct sock *sk = sock->sk;
738 struct xdp_sock *xs = xdp_sk(sk);
739 int err;
740
741 if (level != SOL_XDP)
742 return -ENOPROTOOPT;
743
744 switch (optname) {
745 case XDP_RX_RING:
746 case XDP_TX_RING:
747 {
748 struct xsk_queue **q;
749 int entries;
750
751 if (optlen < sizeof(entries))
752 return -EINVAL;
753 if (copy_from_user(&entries, optval, sizeof(entries)))
754 return -EFAULT;
755
756 mutex_lock(&xs->mutex);
757 if (xs->state != XSK_READY) {
758 mutex_unlock(&xs->mutex);
759 return -EBUSY;
760 }
761 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
762 err = xsk_init_queue(entries, q, false);
763 if (!err && optname == XDP_TX_RING)
764
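/* Tx needs to be explicitly woken up the first time. */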
765 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
766 mutex_unlock(&xs->mutex);
767 return err;
768 }
769 case XDP_UMEM_REG:
770 {
771 size_t mr_size = sizeof(struct xdp_umem_reg);
772 struct xdp_umem_reg mr = {};
773 struct xdp_umem *umem;
774
775 if (optlen < sizeof(struct xdp_umem_reg_v1))
776 return -EINVAL;
777 else if (optlen < sizeof(mr))
778 mr_size = sizeof(struct xdp_umem_reg_v1);
779
780 if (copy_from_user(&mr, optval, mr_size))
781 return -EFAULT;
782
783 mutex_lock(&xs->mutex);
784 if (xs->state != XSK_READY || xs->umem) {
785 mutex_unlock(&xs->mutex);
786 return -EBUSY;
787 }
788
789 umem = xdp_umem_create(&mr);
790 if (IS_ERR(umem)) {
791 mutex_unlock(&xs->mutex);
792 return PTR_ERR(umem);
793 }
794
795
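/* Make sure the umem is fully set up before it is made visible to
 * other CPUs via WRITE_ONCE() below.
 */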
796 smp_wmb();
797 WRITE_ONCE(xs->umem, umem);
798 mutex_unlock(&xs->mutex);
799 return 0;
800 }
801 case XDP_UMEM_FILL_RING:
802 case XDP_UMEM_COMPLETION_RING:
803 {
804 struct xsk_queue **q;
805 int entries;
806
807 if (copy_from_user(&entries, optval, sizeof(entries)))
808 return -EFAULT;
809
810 mutex_lock(&xs->mutex);
811 if (xs->state != XSK_READY) {
812 mutex_unlock(&xs->mutex);
813 return -EBUSY;
814 }
815 if (!xs->umem) {
816 mutex_unlock(&xs->mutex);
817 return -EINVAL;
818 }
819
820 q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
821 &xs->umem->cq;
822 err = xsk_init_queue(entries, q, true);
823 mutex_unlock(&xs->mutex);
824 return err;
825 }
826 default:
827 break;
828 }
829
830 return -ENOPROTOOPT;
831 }
832
833 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
834 {
835 ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
836 ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
837 ring->desc = offsetof(struct xdp_rxtx_ring, desc);
838 }
839
840 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
841 {
842 ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
843 ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
844 ring->desc = offsetof(struct xdp_umem_ring, desc);
845 }
846
847 static int xsk_getsockopt(struct socket *sock, int level, int optname,
848 char __user *optval, int __user *optlen)
849 {
850 struct sock *sk = sock->sk;
851 struct xdp_sock *xs = xdp_sk(sk);
852 int len;
853
854 if (level != SOL_XDP)
855 return -ENOPROTOOPT;
856
857 if (get_user(len, optlen))
858 return -EFAULT;
859 if (len < 0)
860 return -EINVAL;
861
862 switch (optname) {
863 case XDP_STATISTICS:
864 {
865 struct xdp_statistics stats;
866
867 if (len < sizeof(stats))
868 return -EINVAL;
869
870 mutex_lock(&xs->mutex);
871 stats.rx_dropped = xs->rx_dropped;
872 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
873 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
874 mutex_unlock(&xs->mutex);
875
876 if (copy_to_user(optval, &stats, sizeof(stats)))
877 return -EFAULT;
878 if (put_user(sizeof(stats), optlen))
879 return -EFAULT;
880
881 return 0;
882 }
883 case XDP_MMAP_OFFSETS:
884 {
885 struct xdp_mmap_offsets off;
886 struct xdp_mmap_offsets_v1 off_v1;
887 bool flags_supported = true;
888 void *to_copy;
889
890 if (len < sizeof(off_v1))
891 return -EINVAL;
892 else if (len < sizeof(off))
893 flags_supported = false;
894
895 if (flags_supported) {
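/* struct xdp_ring_offset is identical to the v1 layout except for
 * the flags field appended at the end, so the v1 helpers can fill
 * in the first three members.
 */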
899 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
900 &off.rx);
901 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
902 &off.tx);
903 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
904 &off.fr);
905 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
906 &off.cr);
907 off.rx.flags = offsetof(struct xdp_rxtx_ring,
908 ptrs.flags);
909 off.tx.flags = offsetof(struct xdp_rxtx_ring,
910 ptrs.flags);
911 off.fr.flags = offsetof(struct xdp_umem_ring,
912 ptrs.flags);
913 off.cr.flags = offsetof(struct xdp_umem_ring,
914 ptrs.flags);
915
916 len = sizeof(off);
917 to_copy = &off;
918 } else {
919 xsk_enter_rxtx_offsets(&off_v1.rx);
920 xsk_enter_rxtx_offsets(&off_v1.tx);
921 xsk_enter_umem_offsets(&off_v1.fr);
922 xsk_enter_umem_offsets(&off_v1.cr);
923
924 len = sizeof(off_v1);
925 to_copy = &off_v1;
926 }
927
928 if (copy_to_user(optval, to_copy, len))
929 return -EFAULT;
930 if (put_user(len, optlen))
931 return -EFAULT;
932
933 return 0;
934 }
935 case XDP_OPTIONS:
936 {
937 struct xdp_options opts = {};
938
939 if (len < sizeof(opts))
940 return -EINVAL;
941
942 mutex_lock(&xs->mutex);
943 if (xs->zc)
944 opts.flags |= XDP_OPTIONS_ZEROCOPY;
945 mutex_unlock(&xs->mutex);
946
947 len = sizeof(opts);
948 if (copy_to_user(optval, &opts, len))
949 return -EFAULT;
950 if (put_user(len, optlen))
951 return -EFAULT;
952
953 return 0;
954 }
955 default:
956 break;
957 }
958
959 return -EOPNOTSUPP;
960 }
961
962 static int xsk_mmap(struct file *file, struct socket *sock,
963 struct vm_area_struct *vma)
964 {
965 loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
966 unsigned long size = vma->vm_end - vma->vm_start;
967 struct xdp_sock *xs = xdp_sk(sock->sk);
968 struct xsk_queue *q = NULL;
969 struct xdp_umem *umem;
970 unsigned long pfn;
971 struct page *qpg;
972
973 if (READ_ONCE(xs->state) != XSK_READY)
974 return -EBUSY;
975
976 if (offset == XDP_PGOFF_RX_RING) {
977 q = READ_ONCE(xs->rx);
978 } else if (offset == XDP_PGOFF_TX_RING) {
979 q = READ_ONCE(xs->tx);
980 } else {
981 umem = READ_ONCE(xs->umem);
982 if (!umem)
983 return -EINVAL;
984
985
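/* Matches the smp_wmb() in XDP_UMEM_REG. */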
986 smp_rmb();
987 if (offset == XDP_UMEM_PGOFF_FILL_RING)
988 q = READ_ONCE(umem->fq);
989 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
990 q = READ_ONCE(umem->cq);
991 }
992
993 if (!q)
994 return -EINVAL;
995
996
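/* Matches the smp_wmb() in xsk_init_queue(). */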
997 smp_rmb();
998 qpg = virt_to_head_page(q->ring);
999 if (size > page_size(qpg))
1000 return -EINVAL;
1001
1002 pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1003 return remap_pfn_range(vma, vma->vm_start, pfn,
1004 size, vma->vm_page_prot);
1005 }
1006
1007 static int xsk_notifier(struct notifier_block *this,
1008 unsigned long msg, void *ptr)
1009 {
1010 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1011 struct net *net = dev_net(dev);
1012 struct sock *sk;
1013
1014 switch (msg) {
1015 case NETDEV_UNREGISTER:
1016 mutex_lock(&net->xdp.lock);
1017 sk_for_each(sk, &net->xdp.list) {
1018 struct xdp_sock *xs = xdp_sk(sk);
1019
1020 mutex_lock(&xs->mutex);
1021 if (xs->dev == dev) {
1022 sk->sk_err = ENETDOWN;
1023 if (!sock_flag(sk, SOCK_DEAD))
1024 sk->sk_error_report(sk);
1025
1026 xsk_unbind_dev(xs);
1027
1028
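/* Clear the device references held by the umem as well. */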
1029 xdp_umem_clear_dev(xs->umem);
1030 }
1031 mutex_unlock(&xs->mutex);
1032 }
1033 mutex_unlock(&net->xdp.lock);
1034 break;
1035 }
1036 return NOTIFY_DONE;
1037 }
1038
1039 static struct proto xsk_proto = {
1040 .name = "XDP",
1041 .owner = THIS_MODULE,
1042 .obj_size = sizeof(struct xdp_sock),
1043 };
1044
1045 static const struct proto_ops xsk_proto_ops = {
1046 .family = PF_XDP,
1047 .owner = THIS_MODULE,
1048 .release = xsk_release,
1049 .bind = xsk_bind,
1050 .connect = sock_no_connect,
1051 .socketpair = sock_no_socketpair,
1052 .accept = sock_no_accept,
1053 .getname = sock_no_getname,
1054 .poll = xsk_poll,
1055 .ioctl = sock_no_ioctl,
1056 .listen = sock_no_listen,
1057 .shutdown = sock_no_shutdown,
1058 .setsockopt = xsk_setsockopt,
1059 .getsockopt = xsk_getsockopt,
1060 .sendmsg = xsk_sendmsg,
1061 .recvmsg = sock_no_recvmsg,
1062 .mmap = xsk_mmap,
1063 .sendpage = sock_no_sendpage,
1064 };
1065
1066 static void xsk_destruct(struct sock *sk)
1067 {
1068 struct xdp_sock *xs = xdp_sk(sk);
1069
1070 if (!sock_flag(sk, SOCK_DEAD))
1071 return;
1072
1073 xdp_put_umem(xs->umem);
1074
1075 sk_refcnt_debug_dec(sk);
1076 }
1077
1078 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1079 int kern)
1080 {
1081 struct sock *sk;
1082 struct xdp_sock *xs;
1083
1084 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1085 return -EPERM;
1086 if (sock->type != SOCK_RAW)
1087 return -ESOCKTNOSUPPORT;
1088
1089 if (protocol)
1090 return -EPROTONOSUPPORT;
1091
1092 sock->state = SS_UNCONNECTED;
1093
1094 sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1095 if (!sk)
1096 return -ENOBUFS;
1097
1098 sock->ops = &xsk_proto_ops;
1099
1100 sock_init_data(sock, sk);
1101
1102 sk->sk_family = PF_XDP;
1103
1104 sk->sk_destruct = xsk_destruct;
1105 sk_refcnt_debug_inc(sk);
1106
1107 sock_set_flag(sk, SOCK_RCU_FREE);
1108
1109 xs = xdp_sk(sk);
1110 xs->state = XSK_READY;
1111 mutex_init(&xs->mutex);
1112 spin_lock_init(&xs->rx_lock);
1113 spin_lock_init(&xs->tx_completion_lock);
1114
1115 INIT_LIST_HEAD(&xs->map_list);
1116 spin_lock_init(&xs->map_list_lock);
1117
1118 mutex_lock(&net->xdp.lock);
1119 sk_add_node_rcu(sk, &net->xdp.list);
1120 mutex_unlock(&net->xdp.lock);
1121
1122 local_bh_disable();
1123 sock_prot_inuse_add(net, &xsk_proto, 1);
1124 local_bh_enable();
1125
1126 return 0;
1127 }
1128
1129 static const struct net_proto_family xsk_family_ops = {
1130 .family = PF_XDP,
1131 .create = xsk_create,
1132 .owner = THIS_MODULE,
1133 };
1134
1135 static struct notifier_block xsk_netdev_notifier = {
1136 .notifier_call = xsk_notifier,
1137 };
1138
1139 static int __net_init xsk_net_init(struct net *net)
1140 {
1141 mutex_init(&net->xdp.lock);
1142 INIT_HLIST_HEAD(&net->xdp.list);
1143 return 0;
1144 }
1145
1146 static void __net_exit xsk_net_exit(struct net *net)
1147 {
1148 WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1149 }
1150
1151 static struct pernet_operations xsk_net_ops = {
1152 .init = xsk_net_init,
1153 .exit = xsk_net_exit,
1154 };
1155
1156 static int __init xsk_init(void)
1157 {
1158 int err;
1159
1160 err = proto_register(&xsk_proto, 0);
1161 if (err)
1162 goto out;
1163
1164 err = sock_register(&xsk_family_ops);
1165 if (err)
1166 goto out_proto;
1167
1168 err = register_pernet_subsys(&xsk_net_ops);
1169 if (err)
1170 goto out_sk;
1171
1172 err = register_netdevice_notifier(&xsk_netdev_notifier);
1173 if (err)
1174 goto out_pernet;
1175
1176 return 0;
1177
1178 out_pernet:
1179 unregister_pernet_subsys(&xsk_net_ops);
1180 out_sk:
1181 sock_unregister(PF_XDP);
1182 out_proto:
1183 proto_unregister(&xsk_proto);
1184 out:
1185 return err;
1186 }
1187
1188 fs_initcall(xsk_init);