net/xdp/xsk.c

DEFINITIONS

This source file includes the following definitions; a brief userspace usage sketch follows the list.
  1. xsk_is_setup_for_bpf_map
  2. xsk_umem_has_addrs
  3. xsk_umem_peek_addr
  4. xsk_umem_discard_addr
  5. xsk_set_rx_need_wakeup
  6. xsk_set_tx_need_wakeup
  7. xsk_clear_rx_need_wakeup
  8. xsk_clear_tx_need_wakeup
  9. xsk_umem_uses_need_wakeup
  10. __xsk_rcv_memcpy
  11. __xsk_rcv
  12. __xsk_rcv_zc
  13. xsk_is_bound
  14. xsk_rcv
  15. xsk_flush
  16. xsk_generic_rcv
  17. xsk_umem_complete_tx
  18. xsk_umem_consume_tx_done
  19. xsk_umem_consume_tx
  20. xsk_wakeup
  21. xsk_zc_xmit
  22. xsk_destruct_skb
  23. xsk_generic_xmit
  24. __xsk_sendmsg
  25. xsk_sendmsg
  26. xsk_poll
  27. xsk_init_queue
  28. xsk_unbind_dev
  29. xsk_get_map_list_entry
  30. xsk_delete_from_maps
  31. xsk_release
  32. xsk_lookup_xsk_from_fd
  33. xsk_check_page_contiguity
  34. xsk_bind
  35. xsk_setsockopt
  36. xsk_enter_rxtx_offsets
  37. xsk_enter_umem_offsets
  38. xsk_getsockopt
  39. xsk_mmap
  40. xsk_notifier
  41. xsk_destruct
  42. xsk_create
  43. xsk_net_init
  44. xsk_net_exit
  45. xsk_init
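
USAGE SKETCH

The handlers below (xsk_setsockopt, xsk_getsockopt, xsk_mmap and xsk_bind) implement the
kernel side of AF_XDP socket setup. The following is a minimal, hedged userspace sketch of
that sequence for a copy-mode Rx socket on a 64-bit host. It assumes an interface named
"eth0", queue 0, 4 KiB chunks and 2048-entry rings; it omits all error handling, fill-ring
production, and the XDP program/xskmap attachment that actually redirects packets to the
socket, so treat it as an illustration rather than a reference implementation.

    #include <linux/if_xdp.h>
    #include <net/if.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #define NUM_FRAMES   4096
    #define FRAME_SIZE   4096   /* == chunk size handed to XDP_UMEM_REG */
    #define RING_ENTRIES 2048

    int main(void)
    {
            struct sockaddr_xdp sxdp = {};
            struct xdp_mmap_offsets off;
            struct xdp_umem_reg mr = {};
            socklen_t optlen = sizeof(off);
            void *umem_area, *fill_ring, *rx_ring;
            int entries = RING_ENTRIES;
            int fd = socket(AF_XDP, SOCK_RAW, 0);   /* see xsk_create() */

            /* Register the UMEM: the XDP_UMEM_REG case in xsk_setsockopt(). */
            posix_memalign(&umem_area, getpagesize(), NUM_FRAMES * FRAME_SIZE);
            mr.addr = (unsigned long)umem_area;
            mr.len = NUM_FRAMES * FRAME_SIZE;
            mr.chunk_size = FRAME_SIZE;
            setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));

            /* Create the rings; xsk_bind() expects both fill and completion rings. */
            setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
            setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries, sizeof(entries));
            setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));

            /* Map the rings: XDP_MMAP_OFFSETS in xsk_getsockopt() plus xsk_mmap(). */
            getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
            fill_ring = mmap(NULL, off.fr.desc + RING_ENTRIES * sizeof(__u64),
                             PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                             fd, XDP_UMEM_PGOFF_FILL_RING);
            rx_ring = mmap(NULL, off.rx.desc + RING_ENTRIES * sizeof(struct xdp_desc),
                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                           fd, XDP_PGOFF_RX_RING);

            /* Bind to one device queue: xsk_bind(). */
            sxdp.sxdp_family = AF_XDP;
            sxdp.sxdp_ifindex = if_nametoindex("eth0");
            sxdp.sxdp_queue_id = 0;
            sxdp.sxdp_flags = XDP_COPY;
            bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));

            /* ... produce fill-ring addresses, consume rx_ring descriptors ... */
            (void)fill_ring;
            (void)rx_ring;
            close(fd);
            return 0;
    }

The same pattern extends to Tx (XDP_TX_RING, XDP_PGOFF_TX_RING) and to zero copy
(XDP_ZEROCOPY), which this file hands off to the driver via ndo_xsk_wakeup.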

   1 // SPDX-License-Identifier: GPL-2.0
   2 /* XDP sockets
   3  *
    4  * AF_XDP sockets provide a channel between XDP programs and userspace
   5  * applications.
   6  * Copyright(c) 2018 Intel Corporation.
   7  *
   8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
   9  *            Magnus Karlsson <magnus.karlsson@intel.com>
  10  */
  11 
  12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
  13 
  14 #include <linux/if_xdp.h>
  15 #include <linux/init.h>
  16 #include <linux/sched/mm.h>
  17 #include <linux/sched/signal.h>
  18 #include <linux/sched/task.h>
  19 #include <linux/socket.h>
  20 #include <linux/file.h>
  21 #include <linux/uaccess.h>
  22 #include <linux/net.h>
  23 #include <linux/netdevice.h>
  24 #include <linux/rculist.h>
  25 #include <net/xdp_sock.h>
  26 #include <net/xdp.h>
  27 
  28 #include "xsk_queue.h"
  29 #include "xdp_umem.h"
  30 #include "xsk.h"
  31 
  32 #define TX_BATCH_SIZE 16
  33 
  34 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
  35 {
   36         return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
  37                 READ_ONCE(xs->umem->fq);
  38 }
  39 
  40 bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
  41 {
  42         return xskq_has_addrs(umem->fq, cnt);
  43 }
  44 EXPORT_SYMBOL(xsk_umem_has_addrs);
  45 
  46 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
  47 {
  48         return xskq_peek_addr(umem->fq, addr, umem);
  49 }
  50 EXPORT_SYMBOL(xsk_umem_peek_addr);
  51 
  52 void xsk_umem_discard_addr(struct xdp_umem *umem)
  53 {
  54         xskq_discard_addr(umem->fq);
  55 }
  56 EXPORT_SYMBOL(xsk_umem_discard_addr);
  57 
  58 void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
  59 {
  60         if (umem->need_wakeup & XDP_WAKEUP_RX)
  61                 return;
  62 
  63         umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
  64         umem->need_wakeup |= XDP_WAKEUP_RX;
  65 }
  66 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
  67 
  68 void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
  69 {
  70         struct xdp_sock *xs;
  71 
  72         if (umem->need_wakeup & XDP_WAKEUP_TX)
  73                 return;
  74 
  75         rcu_read_lock();
  76         list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
  77                 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
  78         }
  79         rcu_read_unlock();
  80 
  81         umem->need_wakeup |= XDP_WAKEUP_TX;
  82 }
  83 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
  84 
  85 void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
  86 {
  87         if (!(umem->need_wakeup & XDP_WAKEUP_RX))
  88                 return;
  89 
  90         umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
  91         umem->need_wakeup &= ~XDP_WAKEUP_RX;
  92 }
  93 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
  94 
  95 void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
  96 {
  97         struct xdp_sock *xs;
  98 
  99         if (!(umem->need_wakeup & XDP_WAKEUP_TX))
 100                 return;
 101 
 102         rcu_read_lock();
 103         list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
 104                 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
 105         }
 106         rcu_read_unlock();
 107 
 108         umem->need_wakeup &= ~XDP_WAKEUP_TX;
 109 }
 110 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
 111 
 112 bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
 113 {
 114         return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
 115 }
 116 EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
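
/* Userspace side of the need_wakeup protocol, for illustration (a hedged
 * sketch; fq_flags and tx_flags stand for pointers to the mapped flags words
 * reported via XDP_MMAP_OFFSETS, and pfd is a struct pollfd for the socket):
 * when the socket is bound with XDP_USE_NEED_WAKEUP, the application only
 * enters the kernel once the driver has set XDP_RING_NEED_WAKEUP, e.g.
 *
 *        if (*fq_flags & XDP_RING_NEED_WAKEUP)
 *                poll(&pfd, 1, timeout);
 *        if (*tx_flags & XDP_RING_NEED_WAKEUP)
 *                sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 */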
 117 
  118 /* If a buffer crosses a page boundary, we need two memcpy() calls, one for
 119  * each page. This is only required in copy mode.
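       * For example, with PAGE_SIZE == 4096, addr == 8000 and len + metalen ==
       * 300, first_len = 4096 - (8000 - 4096) = 192 bytes go to the first page
       * and the remaining 108 bytes to the following, non-contiguous page.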
 120  */
 121 static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
 122                              u32 len, u32 metalen)
 123 {
 124         void *to_buf = xdp_umem_get_data(umem, addr);
 125 
 126         addr = xsk_umem_add_offset_to_addr(addr);
 127         if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
 128                 void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
 129                 u64 page_start = addr & ~(PAGE_SIZE - 1);
 130                 u64 first_len = PAGE_SIZE - (addr - page_start);
 131 
 132                 memcpy(to_buf, from_buf, first_len);
 133                 memcpy(next_pg_addr, from_buf + first_len,
 134                        len + metalen - first_len);
 135 
 136                 return;
 137         }
 138 
 139         memcpy(to_buf, from_buf, len + metalen);
 140 }
 141 
 142 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 143 {
 144         u64 offset = xs->umem->headroom;
 145         u64 addr, memcpy_addr;
 146         void *from_buf;
 147         u32 metalen;
 148         int err;
 149 
 150         if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
 151             len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
 152                 xs->rx_dropped++;
 153                 return -ENOSPC;
 154         }
 155 
 156         if (unlikely(xdp_data_meta_unsupported(xdp))) {
 157                 from_buf = xdp->data;
 158                 metalen = 0;
 159         } else {
 160                 from_buf = xdp->data_meta;
 161                 metalen = xdp->data - xdp->data_meta;
 162         }
 163 
 164         memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
 165         __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
 166 
 167         offset += metalen;
 168         addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
 169         err = xskq_produce_batch_desc(xs->rx, addr, len);
 170         if (!err) {
 171                 xskq_discard_addr(xs->umem->fq);
 172                 xdp_return_buff(xdp);
 173                 return 0;
 174         }
 175 
 176         xs->rx_dropped++;
 177         return err;
 178 }
 179 
 180 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 181 {
 182         int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
 183 
 184         if (err)
 185                 xs->rx_dropped++;
 186 
 187         return err;
 188 }
 189 
 190 static bool xsk_is_bound(struct xdp_sock *xs)
 191 {
 192         if (READ_ONCE(xs->state) == XSK_BOUND) {
 193                 /* Matches smp_wmb() in bind(). */
 194                 smp_rmb();
 195                 return true;
 196         }
 197         return false;
 198 }
 199 
 200 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 201 {
 202         u32 len;
 203 
 204         if (!xsk_is_bound(xs))
 205                 return -EINVAL;
 206 
 207         if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 208                 return -EINVAL;
 209 
 210         len = xdp->data_end - xdp->data;
 211 
 212         return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
 213                 __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
 214 }
 215 
 216 void xsk_flush(struct xdp_sock *xs)
 217 {
 218         xskq_produce_flush_desc(xs->rx);
 219         xs->sk.sk_data_ready(&xs->sk);
 220 }
 221 
 222 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 223 {
 224         u32 metalen = xdp->data - xdp->data_meta;
 225         u32 len = xdp->data_end - xdp->data;
 226         u64 offset = xs->umem->headroom;
 227         void *buffer;
 228         u64 addr;
 229         int err;
 230 
 231         spin_lock_bh(&xs->rx_lock);
 232 
 233         if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
 234                 err = -EINVAL;
 235                 goto out_unlock;
 236         }
 237 
 238         if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
 239             len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
 240                 err = -ENOSPC;
 241                 goto out_drop;
 242         }
 243 
 244         addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
 245         buffer = xdp_umem_get_data(xs->umem, addr);
 246         memcpy(buffer, xdp->data_meta, len + metalen);
 247 
 248         addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
 249         err = xskq_produce_batch_desc(xs->rx, addr, len);
 250         if (err)
 251                 goto out_drop;
 252 
 253         xskq_discard_addr(xs->umem->fq);
 254         xskq_produce_flush_desc(xs->rx);
 255 
 256         spin_unlock_bh(&xs->rx_lock);
 257 
 258         xs->sk.sk_data_ready(&xs->sk);
 259         return 0;
 260 
 261 out_drop:
 262         xs->rx_dropped++;
 263 out_unlock:
 264         spin_unlock_bh(&xs->rx_lock);
 265         return err;
 266 }
 267 
 268 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
 269 {
 270         xskq_produce_flush_addr_n(umem->cq, nb_entries);
 271 }
 272 EXPORT_SYMBOL(xsk_umem_complete_tx);
 273 
 274 void xsk_umem_consume_tx_done(struct xdp_umem *umem)
 275 {
 276         struct xdp_sock *xs;
 277 
 278         rcu_read_lock();
 279         list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
 280                 xs->sk.sk_write_space(&xs->sk);
 281         }
 282         rcu_read_unlock();
 283 }
 284 EXPORT_SYMBOL(xsk_umem_consume_tx_done);
 285 
 286 bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
 287 {
 288         struct xdp_sock *xs;
 289 
 290         rcu_read_lock();
 291         list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
 292                 if (!xskq_peek_desc(xs->tx, desc, umem))
 293                         continue;
 294 
 295                 if (xskq_produce_addr_lazy(umem->cq, desc->addr))
 296                         goto out;
 297 
 298                 xskq_discard_desc(xs->tx);
 299                 rcu_read_unlock();
 300                 return true;
 301         }
 302 
 303 out:
 304         rcu_read_unlock();
 305         return false;
 306 }
 307 EXPORT_SYMBOL(xsk_umem_consume_tx);
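
/* A minimal sketch of how a zero-copy driver might drive the Tx helpers
 * exported above ("budget" and "completed" are illustrative; DMA syncing,
 * descriptor formatting and doorbell writes are omitted):
 *
 *        struct xdp_desc desc;
 *
 *        while (budget-- && xsk_umem_consume_tx(umem, &desc)) {
 *                dma_addr_t dma = xdp_umem_get_dma(umem, desc.addr);
 *
 *                ... post dma and desc.len to the hardware Tx ring ...
 *        }
 *        xsk_umem_consume_tx_done(umem);
 *
 * and, once the hardware reports that "completed" frames are done, it hands
 * them back to the completion ring with:
 *
 *        xsk_umem_complete_tx(umem, completed);
 */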
 308 
 309 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 310 {
 311         struct net_device *dev = xs->dev;
 312         int err;
 313 
 314         rcu_read_lock();
 315         err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 316         rcu_read_unlock();
 317 
 318         return err;
 319 }
 320 
 321 static int xsk_zc_xmit(struct xdp_sock *xs)
 322 {
 323         return xsk_wakeup(xs, XDP_WAKEUP_TX);
 324 }
 325 
 326 static void xsk_destruct_skb(struct sk_buff *skb)
 327 {
 328         u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
 329         struct xdp_sock *xs = xdp_sk(skb->sk);
 330         unsigned long flags;
 331 
 332         spin_lock_irqsave(&xs->tx_completion_lock, flags);
 333         WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
 334         spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
 335 
 336         sock_wfree(skb);
 337 }
 338 
 339 static int xsk_generic_xmit(struct sock *sk)
 340 {
 341         struct xdp_sock *xs = xdp_sk(sk);
 342         u32 max_batch = TX_BATCH_SIZE;
 343         bool sent_frame = false;
 344         struct xdp_desc desc;
 345         struct sk_buff *skb;
 346         int err = 0;
 347 
 348         mutex_lock(&xs->mutex);
 349 
 350         if (xs->queue_id >= xs->dev->real_num_tx_queues)
 351                 goto out;
 352 
 353         while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
 354                 char *buffer;
 355                 u64 addr;
 356                 u32 len;
 357 
 358                 if (max_batch-- == 0) {
 359                         err = -EAGAIN;
 360                         goto out;
 361                 }
 362 
 363                 len = desc.len;
 364                 skb = sock_alloc_send_skb(sk, len, 1, &err);
 365                 if (unlikely(!skb)) {
 366                         err = -EAGAIN;
 367                         goto out;
 368                 }
 369 
 370                 skb_put(skb, len);
 371                 addr = desc.addr;
 372                 buffer = xdp_umem_get_data(xs->umem, addr);
 373                 err = skb_store_bits(skb, 0, buffer, len);
 374                 if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
 375                         kfree_skb(skb);
 376                         goto out;
 377                 }
 378 
 379                 skb->dev = xs->dev;
 380                 skb->priority = sk->sk_priority;
 381                 skb->mark = sk->sk_mark;
 382                 skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
 383                 skb->destructor = xsk_destruct_skb;
 384 
 385                 err = dev_direct_xmit(skb, xs->queue_id);
 386                 xskq_discard_desc(xs->tx);
 387                 /* Ignore NET_XMIT_CN as packet might have been sent */
 388                 if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
 389                         /* SKB completed but not sent */
 390                         err = -EBUSY;
 391                         goto out;
 392                 }
 393 
 394                 sent_frame = true;
 395         }
 396 
 397 out:
 398         if (sent_frame)
 399                 sk->sk_write_space(sk);
 400 
 401         mutex_unlock(&xs->mutex);
 402         return err;
 403 }
 404 
 405 static int __xsk_sendmsg(struct sock *sk)
 406 {
 407         struct xdp_sock *xs = xdp_sk(sk);
 408 
 409         if (unlikely(!(xs->dev->flags & IFF_UP)))
 410                 return -ENETDOWN;
 411         if (unlikely(!xs->tx))
 412                 return -ENOBUFS;
 413 
 414         return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
 415 }
 416 
 417 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 418 {
 419         bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
 420         struct sock *sk = sock->sk;
 421         struct xdp_sock *xs = xdp_sk(sk);
 422 
 423         if (unlikely(!xsk_is_bound(xs)))
 424                 return -ENXIO;
 425         if (unlikely(need_wait))
 426                 return -EOPNOTSUPP;
 427 
 428         return __xsk_sendmsg(sk);
 429 }
 430 
 431 static unsigned int xsk_poll(struct file *file, struct socket *sock,
 432                              struct poll_table_struct *wait)
 433 {
 434         unsigned int mask = datagram_poll(file, sock, wait);
 435         struct sock *sk = sock->sk;
 436         struct xdp_sock *xs = xdp_sk(sk);
 437         struct xdp_umem *umem;
 438 
 439         if (unlikely(!xsk_is_bound(xs)))
 440                 return mask;
 441 
 442         umem = xs->umem;
 443 
 444         if (umem->need_wakeup) {
 445                 if (xs->zc)
 446                         xsk_wakeup(xs, umem->need_wakeup);
 447                 else
 448                         /* Poll needs to drive Tx also in copy mode */
 449                         __xsk_sendmsg(sk);
 450         }
 451 
 452         if (xs->rx && !xskq_empty_desc(xs->rx))
 453                 mask |= POLLIN | POLLRDNORM;
 454         if (xs->tx && !xskq_full_desc(xs->tx))
 455                 mask |= POLLOUT | POLLWRNORM;
 456 
 457         return mask;
 458 }
 459 
 460 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 461                           bool umem_queue)
 462 {
 463         struct xsk_queue *q;
 464 
 465         if (entries == 0 || *queue || !is_power_of_2(entries))
 466                 return -EINVAL;
 467 
 468         q = xskq_create(entries, umem_queue);
 469         if (!q)
 470                 return -ENOMEM;
 471 
 472         /* Make sure queue is ready before it can be seen by others */
 473         smp_wmb();
 474         WRITE_ONCE(*queue, q);
 475         return 0;
 476 }
 477 
 478 static void xsk_unbind_dev(struct xdp_sock *xs)
 479 {
 480         struct net_device *dev = xs->dev;
 481 
 482         if (xs->state != XSK_BOUND)
 483                 return;
 484         WRITE_ONCE(xs->state, XSK_UNBOUND);
 485 
 486         /* Wait for driver to stop using the xdp socket. */
 487         xdp_del_sk_umem(xs->umem, xs);
 488         xs->dev = NULL;
 489         synchronize_net();
 490         dev_put(dev);
 491 }
 492 
 493 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
 494                                               struct xdp_sock ***map_entry)
 495 {
 496         struct xsk_map *map = NULL;
 497         struct xsk_map_node *node;
 498 
 499         *map_entry = NULL;
 500 
 501         spin_lock_bh(&xs->map_list_lock);
 502         node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
 503                                         node);
 504         if (node) {
 505                 WARN_ON(xsk_map_inc(node->map));
 506                 map = node->map;
 507                 *map_entry = node->map_entry;
 508         }
 509         spin_unlock_bh(&xs->map_list_lock);
 510         return map;
 511 }
 512 
 513 static void xsk_delete_from_maps(struct xdp_sock *xs)
 514 {
 515         /* This function removes the current XDP socket from all the
 516          * maps it resides in. We need to take extra care here, due to
 517          * the two locks involved. Each map has a lock synchronizing
 518          * updates to the entries, and each socket has a lock that
 519          * synchronizes access to the list of maps (map_list). For
 520          * deadlock avoidance the locks need to be taken in the order
 521          * "map lock"->"socket map list lock". We start off by
 522          * accessing the socket map list, and take a reference to the
 523          * map to guarantee existence between the
 524          * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
 525          * calls. Then we ask the map to remove the socket, which
 526          * tries to remove the socket from the map. Note that there
 527          * might be updates to the map between
 528          * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
 529          */
 530         struct xdp_sock **map_entry = NULL;
 531         struct xsk_map *map;
 532 
 533         while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
 534                 xsk_map_try_sock_delete(map, xs, map_entry);
 535                 xsk_map_put(map);
 536         }
 537 }
 538 
 539 static int xsk_release(struct socket *sock)
 540 {
 541         struct sock *sk = sock->sk;
 542         struct xdp_sock *xs = xdp_sk(sk);
 543         struct net *net;
 544 
 545         if (!sk)
 546                 return 0;
 547 
 548         net = sock_net(sk);
 549 
 550         mutex_lock(&net->xdp.lock);
 551         sk_del_node_init_rcu(sk);
 552         mutex_unlock(&net->xdp.lock);
 553 
 554         local_bh_disable();
 555         sock_prot_inuse_add(net, sk->sk_prot, -1);
 556         local_bh_enable();
 557 
 558         xsk_delete_from_maps(xs);
 559         mutex_lock(&xs->mutex);
 560         xsk_unbind_dev(xs);
 561         mutex_unlock(&xs->mutex);
 562 
 563         xskq_destroy(xs->rx);
 564         xskq_destroy(xs->tx);
 565 
 566         sock_orphan(sk);
 567         sock->sk = NULL;
 568 
 569         sk_refcnt_debug_release(sk);
 570         sock_put(sk);
 571 
 572         return 0;
 573 }
 574 
 575 static struct socket *xsk_lookup_xsk_from_fd(int fd)
 576 {
 577         struct socket *sock;
 578         int err;
 579 
 580         sock = sockfd_lookup(fd, &err);
 581         if (!sock)
 582                 return ERR_PTR(-ENOTSOCK);
 583 
 584         if (sock->sk->sk_family != PF_XDP) {
 585                 sockfd_put(sock);
 586                 return ERR_PTR(-ENOPROTOOPT);
 587         }
 588 
 589         return sock;
 590 }
 591 
 592 /* Check if umem pages are contiguous.
  593  * In zero-copy mode, use the DMA address to do the page contiguity check.
  594  * For all other modes we use addr (the kernel virtual address).
 595  * Store the result in the low bits of addr.
 596  */
 597 static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
 598 {
 599         struct xdp_umem_page *pgs = umem->pages;
 600         int i, is_contig;
 601 
 602         for (i = 0; i < umem->npgs - 1; i++) {
 603                 is_contig = (flags & XDP_ZEROCOPY) ?
 604                         (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
 605                         (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
 606                 pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
 607         }
 608 }
 609 
 610 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 611 {
 612         struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
 613         struct sock *sk = sock->sk;
 614         struct xdp_sock *xs = xdp_sk(sk);
 615         struct net_device *dev;
 616         u32 flags, qid;
 617         int err = 0;
 618 
 619         if (addr_len < sizeof(struct sockaddr_xdp))
 620                 return -EINVAL;
 621         if (sxdp->sxdp_family != AF_XDP)
 622                 return -EINVAL;
 623 
 624         flags = sxdp->sxdp_flags;
 625         if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
 626                       XDP_USE_NEED_WAKEUP))
 627                 return -EINVAL;
 628 
 629         rtnl_lock();
 630         mutex_lock(&xs->mutex);
 631         if (xs->state != XSK_READY) {
 632                 err = -EBUSY;
 633                 goto out_release;
 634         }
 635 
 636         dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
 637         if (!dev) {
 638                 err = -ENODEV;
 639                 goto out_release;
 640         }
 641 
 642         if (!xs->rx && !xs->tx) {
 643                 err = -EINVAL;
 644                 goto out_unlock;
 645         }
 646 
 647         qid = sxdp->sxdp_queue_id;
 648 
 649         if (flags & XDP_SHARED_UMEM) {
 650                 struct xdp_sock *umem_xs;
 651                 struct socket *sock;
 652 
 653                 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
 654                     (flags & XDP_USE_NEED_WAKEUP)) {
 655                         /* Cannot specify flags for shared sockets. */
 656                         err = -EINVAL;
 657                         goto out_unlock;
 658                 }
 659 
 660                 if (xs->umem) {
  661                         /* We already have our own. */
 662                         err = -EINVAL;
 663                         goto out_unlock;
 664                 }
 665 
 666                 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
 667                 if (IS_ERR(sock)) {
 668                         err = PTR_ERR(sock);
 669                         goto out_unlock;
 670                 }
 671 
 672                 umem_xs = xdp_sk(sock->sk);
 673                 if (!xsk_is_bound(umem_xs)) {
 674                         err = -EBADF;
 675                         sockfd_put(sock);
 676                         goto out_unlock;
 677                 }
 678                 if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
 679                         err = -EINVAL;
 680                         sockfd_put(sock);
 681                         goto out_unlock;
 682                 }
 683 
 684                 xdp_get_umem(umem_xs->umem);
 685                 WRITE_ONCE(xs->umem, umem_xs->umem);
 686                 sockfd_put(sock);
 687         } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
 688                 err = -EINVAL;
 689                 goto out_unlock;
 690         } else {
 691                 /* This xsk has its own umem. */
 692                 xskq_set_umem(xs->umem->fq, xs->umem->size,
 693                               xs->umem->chunk_mask);
 694                 xskq_set_umem(xs->umem->cq, xs->umem->size,
 695                               xs->umem->chunk_mask);
 696 
 697                 err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
 698                 if (err)
 699                         goto out_unlock;
 700 
 701                 xsk_check_page_contiguity(xs->umem, flags);
 702         }
 703 
 704         xs->dev = dev;
 705         xs->zc = xs->umem->zc;
 706         xs->queue_id = qid;
 707         xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
 708         xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
 709         xdp_add_sk_umem(xs->umem, xs);
 710 
 711 out_unlock:
 712         if (err) {
 713                 dev_put(dev);
 714         } else {
 715                 /* Matches smp_rmb() in bind() for shared umem
 716                  * sockets, and xsk_is_bound().
 717                  */
 718                 smp_wmb();
 719                 WRITE_ONCE(xs->state, XSK_BOUND);
 720         }
 721 out_release:
 722         mutex_unlock(&xs->mutex);
 723         rtnl_unlock();
 724         return err;
 725 }
 726 
 727 struct xdp_umem_reg_v1 {
 728         __u64 addr; /* Start of packet data area */
 729         __u64 len; /* Length of packet data area */
 730         __u32 chunk_size;
 731         __u32 headroom;
 732 };
 733 
 734 static int xsk_setsockopt(struct socket *sock, int level, int optname,
 735                           char __user *optval, unsigned int optlen)
 736 {
 737         struct sock *sk = sock->sk;
 738         struct xdp_sock *xs = xdp_sk(sk);
 739         int err;
 740 
 741         if (level != SOL_XDP)
 742                 return -ENOPROTOOPT;
 743 
 744         switch (optname) {
 745         case XDP_RX_RING:
 746         case XDP_TX_RING:
 747         {
 748                 struct xsk_queue **q;
 749                 int entries;
 750 
 751                 if (optlen < sizeof(entries))
 752                         return -EINVAL;
 753                 if (copy_from_user(&entries, optval, sizeof(entries)))
 754                         return -EFAULT;
 755 
 756                 mutex_lock(&xs->mutex);
 757                 if (xs->state != XSK_READY) {
 758                         mutex_unlock(&xs->mutex);
 759                         return -EBUSY;
 760                 }
 761                 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
 762                 err = xsk_init_queue(entries, q, false);
 763                 if (!err && optname == XDP_TX_RING)
 764                         /* Tx needs to be explicitly woken up the first time */
 765                         xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
 766                 mutex_unlock(&xs->mutex);
 767                 return err;
 768         }
 769         case XDP_UMEM_REG:
 770         {
 771                 size_t mr_size = sizeof(struct xdp_umem_reg);
 772                 struct xdp_umem_reg mr = {};
 773                 struct xdp_umem *umem;
 774 
 775                 if (optlen < sizeof(struct xdp_umem_reg_v1))
 776                         return -EINVAL;
 777                 else if (optlen < sizeof(mr))
 778                         mr_size = sizeof(struct xdp_umem_reg_v1);
 779 
 780                 if (copy_from_user(&mr, optval, mr_size))
 781                         return -EFAULT;
 782 
 783                 mutex_lock(&xs->mutex);
 784                 if (xs->state != XSK_READY || xs->umem) {
 785                         mutex_unlock(&xs->mutex);
 786                         return -EBUSY;
 787                 }
 788 
 789                 umem = xdp_umem_create(&mr);
 790                 if (IS_ERR(umem)) {
 791                         mutex_unlock(&xs->mutex);
 792                         return PTR_ERR(umem);
 793                 }
 794 
 795                 /* Make sure umem is ready before it can be seen by others */
 796                 smp_wmb();
 797                 WRITE_ONCE(xs->umem, umem);
 798                 mutex_unlock(&xs->mutex);
 799                 return 0;
 800         }
 801         case XDP_UMEM_FILL_RING:
 802         case XDP_UMEM_COMPLETION_RING:
 803         {
 804                 struct xsk_queue **q;
 805                 int entries;
 806 
                      if (optlen < sizeof(entries))
                              return -EINVAL;
  807                 if (copy_from_user(&entries, optval, sizeof(entries)))
 808                         return -EFAULT;
 809 
 810                 mutex_lock(&xs->mutex);
 811                 if (xs->state != XSK_READY) {
 812                         mutex_unlock(&xs->mutex);
 813                         return -EBUSY;
 814                 }
 815                 if (!xs->umem) {
 816                         mutex_unlock(&xs->mutex);
 817                         return -EINVAL;
 818                 }
 819 
 820                 q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
 821                         &xs->umem->cq;
 822                 err = xsk_init_queue(entries, q, true);
 823                 mutex_unlock(&xs->mutex);
 824                 return err;
 825         }
 826         default:
 827                 break;
 828         }
 829 
 830         return -ENOPROTOOPT;
 831 }
 832 
 833 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
 834 {
 835         ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
 836         ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
 837         ring->desc = offsetof(struct xdp_rxtx_ring, desc);
 838 }
 839 
 840 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
 841 {
 842         ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
 843         ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
 844         ring->desc = offsetof(struct xdp_umem_ring, desc);
 845 }
 846 
 847 static int xsk_getsockopt(struct socket *sock, int level, int optname,
 848                           char __user *optval, int __user *optlen)
 849 {
 850         struct sock *sk = sock->sk;
 851         struct xdp_sock *xs = xdp_sk(sk);
 852         int len;
 853 
 854         if (level != SOL_XDP)
 855                 return -ENOPROTOOPT;
 856 
 857         if (get_user(len, optlen))
 858                 return -EFAULT;
 859         if (len < 0)
 860                 return -EINVAL;
 861 
 862         switch (optname) {
 863         case XDP_STATISTICS:
 864         {
 865                 struct xdp_statistics stats;
 866 
 867                 if (len < sizeof(stats))
 868                         return -EINVAL;
 869 
 870                 mutex_lock(&xs->mutex);
 871                 stats.rx_dropped = xs->rx_dropped;
 872                 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
 873                 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
 874                 mutex_unlock(&xs->mutex);
 875 
 876                 if (copy_to_user(optval, &stats, sizeof(stats)))
 877                         return -EFAULT;
 878                 if (put_user(sizeof(stats), optlen))
 879                         return -EFAULT;
 880 
 881                 return 0;
 882         }
 883         case XDP_MMAP_OFFSETS:
 884         {
 885                 struct xdp_mmap_offsets off;
 886                 struct xdp_mmap_offsets_v1 off_v1;
 887                 bool flags_supported = true;
 888                 void *to_copy;
 889 
 890                 if (len < sizeof(off_v1))
 891                         return -EINVAL;
 892                 else if (len < sizeof(off))
 893                         flags_supported = false;
 894 
 895                 if (flags_supported) {
 896                         /* xdp_ring_offset is identical to xdp_ring_offset_v1
 897                          * except for the flags field added to the end.
 898                          */
 899                         xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
 900                                                &off.rx);
 901                         xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
 902                                                &off.tx);
 903                         xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
 904                                                &off.fr);
 905                         xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
 906                                                &off.cr);
 907                         off.rx.flags = offsetof(struct xdp_rxtx_ring,
 908                                                 ptrs.flags);
 909                         off.tx.flags = offsetof(struct xdp_rxtx_ring,
 910                                                 ptrs.flags);
 911                         off.fr.flags = offsetof(struct xdp_umem_ring,
 912                                                 ptrs.flags);
 913                         off.cr.flags = offsetof(struct xdp_umem_ring,
 914                                                 ptrs.flags);
 915 
 916                         len = sizeof(off);
 917                         to_copy = &off;
 918                 } else {
 919                         xsk_enter_rxtx_offsets(&off_v1.rx);
 920                         xsk_enter_rxtx_offsets(&off_v1.tx);
 921                         xsk_enter_umem_offsets(&off_v1.fr);
 922                         xsk_enter_umem_offsets(&off_v1.cr);
 923 
 924                         len = sizeof(off_v1);
 925                         to_copy = &off_v1;
 926                 }
 927 
 928                 if (copy_to_user(optval, to_copy, len))
 929                         return -EFAULT;
 930                 if (put_user(len, optlen))
 931                         return -EFAULT;
 932 
 933                 return 0;
 934         }
 935         case XDP_OPTIONS:
 936         {
 937                 struct xdp_options opts = {};
 938 
 939                 if (len < sizeof(opts))
 940                         return -EINVAL;
 941 
 942                 mutex_lock(&xs->mutex);
 943                 if (xs->zc)
 944                         opts.flags |= XDP_OPTIONS_ZEROCOPY;
 945                 mutex_unlock(&xs->mutex);
 946 
 947                 len = sizeof(opts);
 948                 if (copy_to_user(optval, &opts, len))
 949                         return -EFAULT;
 950                 if (put_user(len, optlen))
 951                         return -EFAULT;
 952 
 953                 return 0;
 954         }
 955         default:
 956                 break;
 957         }
 958 
 959         return -EOPNOTSUPP;
 960 }
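
/* Userspace reads the counters filled in by the XDP_STATISTICS case above
 * with a plain getsockopt() call, e.g. (a hedged sketch, error handling
 * omitted):
 *
 *        struct xdp_statistics stats;
 *        socklen_t optlen = sizeof(stats);
 *
 *        getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen);
 *        printf("rx_dropped: %llu\n", (unsigned long long)stats.rx_dropped);
 */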
 961 
 962 static int xsk_mmap(struct file *file, struct socket *sock,
 963                     struct vm_area_struct *vma)
 964 {
 965         loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 966         unsigned long size = vma->vm_end - vma->vm_start;
 967         struct xdp_sock *xs = xdp_sk(sock->sk);
 968         struct xsk_queue *q = NULL;
 969         struct xdp_umem *umem;
 970         unsigned long pfn;
 971         struct page *qpg;
 972 
 973         if (READ_ONCE(xs->state) != XSK_READY)
 974                 return -EBUSY;
 975 
 976         if (offset == XDP_PGOFF_RX_RING) {
 977                 q = READ_ONCE(xs->rx);
 978         } else if (offset == XDP_PGOFF_TX_RING) {
 979                 q = READ_ONCE(xs->tx);
 980         } else {
 981                 umem = READ_ONCE(xs->umem);
 982                 if (!umem)
 983                         return -EINVAL;
 984 
 985                 /* Matches the smp_wmb() in XDP_UMEM_REG */
 986                 smp_rmb();
 987                 if (offset == XDP_UMEM_PGOFF_FILL_RING)
 988                         q = READ_ONCE(umem->fq);
 989                 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
 990                         q = READ_ONCE(umem->cq);
 991         }
 992 
 993         if (!q)
 994                 return -EINVAL;
 995 
 996         /* Matches the smp_wmb() in xsk_init_queue */
 997         smp_rmb();
 998         qpg = virt_to_head_page(q->ring);
 999         if (size > page_size(qpg))
1000                 return -EINVAL;
1001 
1002         pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1003         return remap_pfn_range(vma, vma->vm_start, pfn,
1004                                size, vma->vm_page_prot);
1005 }
1006 
1007 static int xsk_notifier(struct notifier_block *this,
1008                         unsigned long msg, void *ptr)
1009 {
1010         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1011         struct net *net = dev_net(dev);
1012         struct sock *sk;
1013 
1014         switch (msg) {
1015         case NETDEV_UNREGISTER:
1016                 mutex_lock(&net->xdp.lock);
1017                 sk_for_each(sk, &net->xdp.list) {
1018                         struct xdp_sock *xs = xdp_sk(sk);
1019 
1020                         mutex_lock(&xs->mutex);
1021                         if (xs->dev == dev) {
1022                                 sk->sk_err = ENETDOWN;
1023                                 if (!sock_flag(sk, SOCK_DEAD))
1024                                         sk->sk_error_report(sk);
1025 
1026                                 xsk_unbind_dev(xs);
1027 
1028                                 /* Clear device references in umem. */
1029                                 xdp_umem_clear_dev(xs->umem);
1030                         }
1031                         mutex_unlock(&xs->mutex);
1032                 }
1033                 mutex_unlock(&net->xdp.lock);
1034                 break;
1035         }
1036         return NOTIFY_DONE;
1037 }
1038 
1039 static struct proto xsk_proto = {
1040         .name =         "XDP",
1041         .owner =        THIS_MODULE,
1042         .obj_size =     sizeof(struct xdp_sock),
1043 };
1044 
1045 static const struct proto_ops xsk_proto_ops = {
1046         .family         = PF_XDP,
1047         .owner          = THIS_MODULE,
1048         .release        = xsk_release,
1049         .bind           = xsk_bind,
1050         .connect        = sock_no_connect,
1051         .socketpair     = sock_no_socketpair,
1052         .accept         = sock_no_accept,
1053         .getname        = sock_no_getname,
1054         .poll           = xsk_poll,
1055         .ioctl          = sock_no_ioctl,
1056         .listen         = sock_no_listen,
1057         .shutdown       = sock_no_shutdown,
1058         .setsockopt     = xsk_setsockopt,
1059         .getsockopt     = xsk_getsockopt,
1060         .sendmsg        = xsk_sendmsg,
1061         .recvmsg        = sock_no_recvmsg,
1062         .mmap           = xsk_mmap,
1063         .sendpage       = sock_no_sendpage,
1064 };
1065 
1066 static void xsk_destruct(struct sock *sk)
1067 {
1068         struct xdp_sock *xs = xdp_sk(sk);
1069 
1070         if (!sock_flag(sk, SOCK_DEAD))
1071                 return;
1072 
1073         xdp_put_umem(xs->umem);
1074 
1075         sk_refcnt_debug_dec(sk);
1076 }
1077 
1078 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1079                       int kern)
1080 {
1081         struct sock *sk;
1082         struct xdp_sock *xs;
1083 
1084         if (!ns_capable(net->user_ns, CAP_NET_RAW))
1085                 return -EPERM;
1086         if (sock->type != SOCK_RAW)
1087                 return -ESOCKTNOSUPPORT;
1088 
1089         if (protocol)
1090                 return -EPROTONOSUPPORT;
1091 
1092         sock->state = SS_UNCONNECTED;
1093 
1094         sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1095         if (!sk)
1096                 return -ENOBUFS;
1097 
1098         sock->ops = &xsk_proto_ops;
1099 
1100         sock_init_data(sock, sk);
1101 
1102         sk->sk_family = PF_XDP;
1103 
1104         sk->sk_destruct = xsk_destruct;
1105         sk_refcnt_debug_inc(sk);
1106 
1107         sock_set_flag(sk, SOCK_RCU_FREE);
1108 
1109         xs = xdp_sk(sk);
1110         xs->state = XSK_READY;
1111         mutex_init(&xs->mutex);
1112         spin_lock_init(&xs->rx_lock);
1113         spin_lock_init(&xs->tx_completion_lock);
1114 
1115         INIT_LIST_HEAD(&xs->map_list);
1116         spin_lock_init(&xs->map_list_lock);
1117 
1118         mutex_lock(&net->xdp.lock);
1119         sk_add_node_rcu(sk, &net->xdp.list);
1120         mutex_unlock(&net->xdp.lock);
1121 
1122         local_bh_disable();
1123         sock_prot_inuse_add(net, &xsk_proto, 1);
1124         local_bh_enable();
1125 
1126         return 0;
1127 }
1128 
1129 static const struct net_proto_family xsk_family_ops = {
1130         .family = PF_XDP,
1131         .create = xsk_create,
1132         .owner  = THIS_MODULE,
1133 };
1134 
1135 static struct notifier_block xsk_netdev_notifier = {
1136         .notifier_call  = xsk_notifier,
1137 };
1138 
1139 static int __net_init xsk_net_init(struct net *net)
1140 {
1141         mutex_init(&net->xdp.lock);
1142         INIT_HLIST_HEAD(&net->xdp.list);
1143         return 0;
1144 }
1145 
1146 static void __net_exit xsk_net_exit(struct net *net)
1147 {
1148         WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1149 }
1150 
1151 static struct pernet_operations xsk_net_ops = {
1152         .init = xsk_net_init,
1153         .exit = xsk_net_exit,
1154 };
1155 
1156 static int __init xsk_init(void)
1157 {
1158         int err;
1159 
1160         err = proto_register(&xsk_proto, 0 /* no slab */);
1161         if (err)
1162                 goto out;
1163 
1164         err = sock_register(&xsk_family_ops);
1165         if (err)
1166                 goto out_proto;
1167 
1168         err = register_pernet_subsys(&xsk_net_ops);
1169         if (err)
1170                 goto out_sk;
1171 
1172         err = register_netdevice_notifier(&xsk_netdev_notifier);
1173         if (err)
1174                 goto out_pernet;
1175 
1176         return 0;
1177 
1178 out_pernet:
1179         unregister_pernet_subsys(&xsk_net_ops);
1180 out_sk:
1181         sock_unregister(PF_XDP);
1182 out_proto:
1183         proto_unregister(&xsk_proto);
1184 out:
1185         return err;
1186 }
1187 
1188 fs_initcall(xsk_init);
