1/* 2 * Copyright 2012 Tilera Corporation. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation, version 2. 7 * 8 * This program is distributed in the hope that it will be useful, but 9 * WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 11 * NON INFRINGEMENT. See the GNU General Public License for 12 * more details. 13 */ 14 15#include <linux/module.h> 16#include <linux/init.h> 17#include <linux/moduleparam.h> 18#include <linux/sched.h> 19#include <linux/kernel.h> /* printk() */ 20#include <linux/slab.h> /* kmalloc() */ 21#include <linux/errno.h> /* error codes */ 22#include <linux/types.h> /* size_t */ 23#include <linux/interrupt.h> 24#include <linux/in.h> 25#include <linux/irq.h> 26#include <linux/netdevice.h> /* struct device, and other headers */ 27#include <linux/etherdevice.h> /* eth_type_trans */ 28#include <linux/skbuff.h> 29#include <linux/ioctl.h> 30#include <linux/cdev.h> 31#include <linux/hugetlb.h> 32#include <linux/in6.h> 33#include <linux/timer.h> 34#include <linux/hrtimer.h> 35#include <linux/ktime.h> 36#include <linux/io.h> 37#include <linux/ctype.h> 38#include <linux/ip.h> 39#include <linux/ipv6.h> 40#include <linux/tcp.h> 41#include <linux/net_tstamp.h> 42#include <linux/ptp_clock_kernel.h> 43 44#include <asm/checksum.h> 45#include <asm/homecache.h> 46#include <gxio/mpipe.h> 47#include <arch/sim.h> 48 49/* Default transmit lockup timeout period, in jiffies. */ 50#define TILE_NET_TIMEOUT (5 * HZ) 51 52/* The maximum number of distinct channels (idesc.channel is 5 bits). */ 53#define TILE_NET_CHANNELS 32 54 55/* Maximum number of idescs to handle per "poll". */ 56#define TILE_NET_BATCH 128 57 58/* Maximum number of packets to handle per "poll". */ 59#define TILE_NET_WEIGHT 64 60 61/* Number of entries in each iqueue. */ 62#define IQUEUE_ENTRIES 512 63 64/* Number of entries in each equeue. */ 65#define EQUEUE_ENTRIES 2048 66 67/* Total header bytes per equeue slot. Must be big enough for 2 bytes 68 * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to 69 * 60 bytes of actual TCP header. We round up to align to cache lines. 70 */ 71#define HEADER_BYTES 128 72 73/* Maximum completions per cpu per device (must be a power of two). 74 * ISSUE: What is the right number here? If this is too small, then 75 * egress might block waiting for free space in a completions array. 76 * ISSUE: At the least, allocate these only for initialized echannels. 77 */ 78#define TILE_NET_MAX_COMPS 64 79 80#define MAX_FRAGS (MAX_SKB_FRAGS + 1) 81 82/* The "kinds" of buffer stacks (small/large/jumbo). */ 83#define MAX_KINDS 3 84 85/* Size of completions data to allocate. 86 * ISSUE: Probably more than needed since we don't use all the channels. 87 */ 88#define COMPS_SIZE (TILE_NET_CHANNELS * sizeof(struct tile_net_comps)) 89 90/* Size of NotifRing data to allocate. */ 91#define NOTIF_RING_SIZE (IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t)) 92 93/* Timeout to wake the per-device TX timer after we stop the queue. 94 * We don't want the timeout too short (adds overhead, and might end 95 * up causing stop/wake/stop/wake cycles) or too long (affects performance). 96 * For the 10 Gb NIC, 30 usec means roughly 30+ 1500-byte packets. 97 */ 98#define TX_TIMER_DELAY_USEC 30 99 100/* Timeout to wake the per-cpu egress timer to free completions. 
*/ 101#define EGRESS_TIMER_DELAY_USEC 1000 102 103MODULE_AUTHOR("Tilera Corporation"); 104MODULE_LICENSE("GPL"); 105 106/* A "packet fragment" (a chunk of memory). */ 107struct frag { 108 void *buf; 109 size_t length; 110}; 111 112/* A single completion. */ 113struct tile_net_comp { 114 /* The "complete_count" when the completion will be complete. */ 115 s64 when; 116 /* The buffer to be freed when the completion is complete. */ 117 struct sk_buff *skb; 118}; 119 120/* The completions for a given cpu and echannel. */ 121struct tile_net_comps { 122 /* The completions. */ 123 struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS]; 124 /* The number of completions used. */ 125 unsigned long comp_next; 126 /* The number of completions freed. */ 127 unsigned long comp_last; 128}; 129 130/* The transmit wake timer for a given cpu and echannel. */ 131struct tile_net_tx_wake { 132 int tx_queue_idx; 133 struct hrtimer timer; 134 struct net_device *dev; 135}; 136 137/* Info for a specific cpu. */ 138struct tile_net_info { 139 /* Our cpu. */ 140 int my_cpu; 141 /* A timer for handling egress completions. */ 142 struct hrtimer egress_timer; 143 /* True if "egress_timer" is scheduled. */ 144 bool egress_timer_scheduled; 145 struct info_mpipe { 146 /* Packet queue. */ 147 gxio_mpipe_iqueue_t iqueue; 148 /* The NAPI struct. */ 149 struct napi_struct napi; 150 /* Number of buffers (by kind) which must still be provided. */ 151 unsigned int num_needed_buffers[MAX_KINDS]; 152 /* instance id. */ 153 int instance; 154 /* True if iqueue is valid. */ 155 bool has_iqueue; 156 /* NAPI flags. */ 157 bool napi_added; 158 bool napi_enabled; 159 /* Comps for each egress channel. */ 160 struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS]; 161 /* Transmit wake timer for each egress channel. */ 162 struct tile_net_tx_wake tx_wake[TILE_NET_CHANNELS]; 163 } mpipe[NR_MPIPE_MAX]; 164}; 165 166/* Info for egress on a particular egress channel. */ 167struct tile_net_egress { 168 /* The "equeue". */ 169 gxio_mpipe_equeue_t *equeue; 170 /* The headers for TSO. */ 171 unsigned char *headers; 172}; 173 174/* Info for a specific device. */ 175struct tile_net_priv { 176 /* Our network device. */ 177 struct net_device *dev; 178 /* The primary link. */ 179 gxio_mpipe_link_t link; 180 /* The primary channel, if open, else -1. */ 181 int channel; 182 /* The "loopify" egress link, if needed. */ 183 gxio_mpipe_link_t loopify_link; 184 /* The "loopify" egress channel, if open, else -1. */ 185 int loopify_channel; 186 /* The egress channel (channel or loopify_channel). */ 187 int echannel; 188 /* mPIPE instance, 0 or 1. */ 189 int instance; 190 /* The timestamp config. */ 191 struct hwtstamp_config stamp_cfg; 192}; 193 194static struct mpipe_data { 195 /* The ingress irq. */ 196 int ingress_irq; 197 198 /* The "context" for all devices. */ 199 gxio_mpipe_context_t context; 200 201 /* Egress info, indexed by "priv->echannel" 202 * (lazily created as needed). 203 */ 204 struct tile_net_egress 205 egress_for_echannel[TILE_NET_CHANNELS]; 206 207 /* Devices currently associated with each channel. 208 * NOTE: The array entry can become NULL after ifconfig down, but 209 * we do not free the underlying net_device structures, so it is 210 * safe to use a pointer after reading it from this array. 211 */ 212 struct net_device 213 *tile_net_devs_for_channel[TILE_NET_CHANNELS]; 214 215 /* The actual memory allocated for the buffer stacks. */ 216 void *buffer_stack_vas[MAX_KINDS]; 217 218 /* The amount of memory allocated for each buffer stack. 
*/ 219 size_t buffer_stack_bytes[MAX_KINDS]; 220 221 /* The first buffer stack index 222 * (small = +0, large = +1, jumbo = +2). 223 */ 224 int first_buffer_stack; 225 226 /* The buckets. */ 227 int first_bucket; 228 int num_buckets; 229 230 /* PTP-specific data. */ 231 struct ptp_clock *ptp_clock; 232 struct ptp_clock_info caps; 233 234 /* Lock for ptp accessors. */ 235 struct mutex ptp_lock; 236 237} mpipe_data[NR_MPIPE_MAX] = { 238 [0 ... (NR_MPIPE_MAX - 1)] { 239 .ingress_irq = -1, 240 .first_buffer_stack = -1, 241 .first_bucket = -1, 242 .num_buckets = 1 243 } 244}; 245 246/* A mutex for "tile_net_devs_for_channel". */ 247static DEFINE_MUTEX(tile_net_devs_for_channel_mutex); 248 249/* The per-cpu info. */ 250static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info); 251 252 253/* The buffer size enums for each buffer stack. 254 * See arch/tile/include/gxio/mpipe.h for the set of possible values. 255 * We avoid the "10384" size because it can induce "false chaining" 256 * on "cut-through" jumbo packets. 257 */ 258static gxio_mpipe_buffer_size_enum_t buffer_size_enums[MAX_KINDS] = { 259 GXIO_MPIPE_BUFFER_SIZE_128, 260 GXIO_MPIPE_BUFFER_SIZE_1664, 261 GXIO_MPIPE_BUFFER_SIZE_16384 262}; 263 264/* Text value of tile_net.cpus if passed as a module parameter. */ 265static char *network_cpus_string; 266 267/* The actual cpus in "network_cpus". */ 268static struct cpumask network_cpus_map; 269 270/* If "tile_net.loopify=LINK" was specified, this is "LINK". */ 271static char *loopify_link_name; 272 273/* If "tile_net.custom" was specified, this is true. */ 274static bool custom_flag; 275 276/* If "tile_net.jumbo=NUM" was specified, this is "NUM". */ 277static uint jumbo_num; 278 279/* Obtain mpipe instance from struct tile_net_priv given struct net_device. */ 280static inline int mpipe_instance(struct net_device *dev) 281{ 282 struct tile_net_priv *priv = netdev_priv(dev); 283 return priv->instance; 284} 285 286/* The "tile_net.cpus" argument specifies the cpus that are dedicated 287 * to handle ingress packets. 288 * 289 * The parameter should be in the form "tile_net.cpus=m-n[,x-y]", where 290 * m, n, x, y are integer numbers that represent the cpus that can be 291 * neither a dedicated cpu nor a dataplane cpu. 292 */ 293static bool network_cpus_init(void) 294{ 295 int rc; 296 297 if (network_cpus_string == NULL) 298 return false; 299 300 rc = cpulist_parse_crop(network_cpus_string, &network_cpus_map); 301 if (rc != 0) { 302 pr_warn("tile_net.cpus=%s: malformed cpu list\n", 303 network_cpus_string); 304 return false; 305 } 306 307 /* Remove dedicated cpus. */ 308 cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask); 309 310 if (cpumask_empty(&network_cpus_map)) { 311 pr_warn("Ignoring empty tile_net.cpus='%s'.\n", 312 network_cpus_string); 313 return false; 314 } 315 316 pr_info("Linux network CPUs: %*pbl\n", 317 cpumask_pr_args(&network_cpus_map)); 318 return true; 319} 320 321module_param_named(cpus, network_cpus_string, charp, 0444); 322MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts"); 323 324/* The "tile_net.loopify=LINK" argument causes the named device to 325 * actually use "loop0" for ingress, and "loop1" for egress. This 326 * allows an app to sit between the actual link and linux, passing 327 * (some) packets along to linux, and forwarding (some) packets sent 328 * out by linux. 
329 */ 330module_param_named(loopify, loopify_link_name, charp, 0444); 331MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress"); 332 333/* The "tile_net.custom" argument causes us to ignore the "conventional" 334 * classifier metadata, in particular, the "l2_offset". 335 */ 336module_param_named(custom, custom_flag, bool, 0444); 337MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier"); 338 339/* The "tile_net.jumbo" argument causes us to support "jumbo" packets, 340 * and to allocate the given number of "jumbo" buffers. 341 */ 342module_param_named(jumbo, jumbo_num, uint, 0444); 343MODULE_PARM_DESC(jumbo, "the number of buffers to support jumbo packets"); 344 345/* Atomically update a statistics field. 346 * Note that on TILE-Gx, this operation is fire-and-forget on the 347 * issuing core (single-cycle dispatch) and takes only a few cycles 348 * longer than a regular store when the request reaches the home cache. 349 * No expensive bus management overhead is required. 350 */ 351static void tile_net_stats_add(unsigned long value, unsigned long *field) 352{ 353 BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(unsigned long)); 354 atomic_long_add(value, (atomic_long_t *)field); 355} 356 357/* Allocate and push a buffer. */ 358static bool tile_net_provide_buffer(int instance, int kind) 359{ 360 struct mpipe_data *md = &mpipe_data[instance]; 361 gxio_mpipe_buffer_size_enum_t bse = buffer_size_enums[kind]; 362 size_t bs = gxio_mpipe_buffer_size_enum_to_buffer_size(bse); 363 const unsigned long buffer_alignment = 128; 364 struct sk_buff *skb; 365 int len; 366 367 len = sizeof(struct sk_buff **) + buffer_alignment + bs; 368 skb = dev_alloc_skb(len); 369 if (skb == NULL) 370 return false; 371 372 /* Make room for a back-pointer to 'skb' and guarantee alignment. */ 373 skb_reserve(skb, sizeof(struct sk_buff **)); 374 skb_reserve(skb, -(long)skb->data & (buffer_alignment - 1)); 375 376 /* Save a back-pointer to 'skb'. */ 377 *(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb; 378 379 /* Make sure "skb" and the back-pointer have been flushed. */ 380 wmb(); 381 382 gxio_mpipe_push_buffer(&md->context, md->first_buffer_stack + kind, 383 (void *)va_to_tile_io_addr(skb->data)); 384 385 return true; 386} 387 388/* Convert a raw mpipe buffer to its matching skb pointer. */ 389static struct sk_buff *mpipe_buf_to_skb(void *va) 390{ 391 /* Acquire the associated "skb". */ 392 struct sk_buff **skb_ptr = va - sizeof(*skb_ptr); 393 struct sk_buff *skb = *skb_ptr; 394 395 /* Paranoia. */ 396 if (skb->data != va) { 397 /* Panic here since there's a reasonable chance 398 * that corrupt buffers means generic memory 399 * corruption, with unpredictable system effects. 400 */ 401 panic("Corrupt linux buffer! va=%p, skb=%p, skb->data=%p", 402 va, skb, skb->data); 403 } 404 405 return skb; 406} 407 408static void tile_net_pop_all_buffers(int instance, int stack) 409{ 410 struct mpipe_data *md = &mpipe_data[instance]; 411 412 for (;;) { 413 tile_io_addr_t addr = 414 (tile_io_addr_t)gxio_mpipe_pop_buffer(&md->context, 415 stack); 416 if (addr == 0) 417 break; 418 dev_kfree_skb_irq(mpipe_buf_to_skb(tile_io_addr_to_va(addr))); 419 } 420} 421 422/* Provide linux buffers to mPIPE. 
*/ 423static void tile_net_provide_needed_buffers(void) 424{ 425 struct tile_net_info *info = this_cpu_ptr(&per_cpu_info); 426 int instance, kind; 427 for (instance = 0; instance < NR_MPIPE_MAX && 428 info->mpipe[instance].has_iqueue; instance++) { 429 for (kind = 0; kind < MAX_KINDS; kind++) { 430 while (info->mpipe[instance].num_needed_buffers[kind] 431 != 0) { 432 if (!tile_net_provide_buffer(instance, kind)) { 433 pr_notice("Tile %d still needs" 434 " some buffers\n", 435 info->my_cpu); 436 return; 437 } 438 info->mpipe[instance]. 439 num_needed_buffers[kind]--; 440 } 441 } 442 } 443} 444 445/* Get RX timestamp, and store it in the skb. */ 446static void tile_rx_timestamp(struct tile_net_priv *priv, struct sk_buff *skb, 447 gxio_mpipe_idesc_t *idesc) 448{ 449 if (unlikely(priv->stamp_cfg.rx_filter != HWTSTAMP_FILTER_NONE)) { 450 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); 451 memset(shhwtstamps, 0, sizeof(*shhwtstamps)); 452 shhwtstamps->hwtstamp = ktime_set(idesc->time_stamp_sec, 453 idesc->time_stamp_ns); 454 } 455} 456 457/* Get TX timestamp, and store it in the skb. */ 458static void tile_tx_timestamp(struct sk_buff *skb, int instance) 459{ 460 struct skb_shared_info *shtx = skb_shinfo(skb); 461 if (unlikely((shtx->tx_flags & SKBTX_HW_TSTAMP) != 0)) { 462 struct mpipe_data *md = &mpipe_data[instance]; 463 struct skb_shared_hwtstamps shhwtstamps; 464 struct timespec ts; 465 466 shtx->tx_flags |= SKBTX_IN_PROGRESS; 467 gxio_mpipe_get_timestamp(&md->context, &ts); 468 memset(&shhwtstamps, 0, sizeof(shhwtstamps)); 469 shhwtstamps.hwtstamp = ktime_set(ts.tv_sec, ts.tv_nsec); 470 skb_tstamp_tx(skb, &shhwtstamps); 471 } 472} 473 474/* Use ioctl() to enable or disable TX or RX timestamping. */ 475static int tile_hwtstamp_set(struct net_device *dev, struct ifreq *rq) 476{ 477 struct hwtstamp_config config; 478 struct tile_net_priv *priv = netdev_priv(dev); 479 480 if (copy_from_user(&config, rq->ifr_data, sizeof(config))) 481 return -EFAULT; 482 483 if (config.flags) /* reserved for future extensions */ 484 return -EINVAL; 485 486 switch (config.tx_type) { 487 case HWTSTAMP_TX_OFF: 488 case HWTSTAMP_TX_ON: 489 break; 490 default: 491 return -ERANGE; 492 } 493 494 switch (config.rx_filter) { 495 case HWTSTAMP_FILTER_NONE: 496 break; 497 case HWTSTAMP_FILTER_ALL: 498 case HWTSTAMP_FILTER_SOME: 499 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: 500 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: 501 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: 502 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: 503 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: 504 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: 505 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: 506 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: 507 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: 508 case HWTSTAMP_FILTER_PTP_V2_EVENT: 509 case HWTSTAMP_FILTER_PTP_V2_SYNC: 510 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: 511 config.rx_filter = HWTSTAMP_FILTER_ALL; 512 break; 513 default: 514 return -ERANGE; 515 } 516 517 if (copy_to_user(rq->ifr_data, &config, sizeof(config))) 518 return -EFAULT; 519 520 priv->stamp_cfg = config; 521 return 0; 522} 523 524static int tile_hwtstamp_get(struct net_device *dev, struct ifreq *rq) 525{ 526 struct tile_net_priv *priv = netdev_priv(dev); 527 528 if (copy_to_user(rq->ifr_data, &priv->stamp_cfg, 529 sizeof(priv->stamp_cfg))) 530 return -EFAULT; 531 532 return 0; 533} 534 535static inline bool filter_packet(struct net_device *dev, void *buf) 536{ 537 /* Filter packets received before we're up. 
*/ 538 if (dev == NULL || !(dev->flags & IFF_UP)) 539 return true; 540 541 /* Filter out packets that aren't for us. */ 542 if (!(dev->flags & IFF_PROMISC) && 543 !is_multicast_ether_addr(buf) && 544 !ether_addr_equal(dev->dev_addr, buf)) 545 return true; 546 547 return false; 548} 549 550static void tile_net_receive_skb(struct net_device *dev, struct sk_buff *skb, 551 gxio_mpipe_idesc_t *idesc, unsigned long len) 552{ 553 struct tile_net_info *info = this_cpu_ptr(&per_cpu_info); 554 struct tile_net_priv *priv = netdev_priv(dev); 555 int instance = priv->instance; 556 557 /* Encode the actual packet length. */ 558 skb_put(skb, len); 559 560 skb->protocol = eth_type_trans(skb, dev); 561 562 /* Acknowledge "good" hardware checksums. */ 563 if (idesc->cs && idesc->csum_seed_val == 0xFFFF) 564 skb->ip_summed = CHECKSUM_UNNECESSARY; 565 566 /* Get RX timestamp from idesc. */ 567 tile_rx_timestamp(priv, skb, idesc); 568 569 napi_gro_receive(&info->mpipe[instance].napi, skb); 570 571 /* Update stats. */ 572 tile_net_stats_add(1, &dev->stats.rx_packets); 573 tile_net_stats_add(len, &dev->stats.rx_bytes); 574 575 /* Need a new buffer. */ 576 if (idesc->size == buffer_size_enums[0]) 577 info->mpipe[instance].num_needed_buffers[0]++; 578 else if (idesc->size == buffer_size_enums[1]) 579 info->mpipe[instance].num_needed_buffers[1]++; 580 else 581 info->mpipe[instance].num_needed_buffers[2]++; 582} 583 584/* Handle a packet. Return true if "processed", false if "filtered". */ 585static bool tile_net_handle_packet(int instance, gxio_mpipe_idesc_t *idesc) 586{ 587 struct tile_net_info *info = this_cpu_ptr(&per_cpu_info); 588 struct mpipe_data *md = &mpipe_data[instance]; 589 struct net_device *dev = md->tile_net_devs_for_channel[idesc->channel]; 590 uint8_t l2_offset; 591 void *va; 592 void *buf; 593 unsigned long len; 594 bool filter; 595 596 /* Drop packets for which no buffer was available (which can 597 * happen under heavy load), or for which the me/tr/ce flags 598 * are set (which can happen for jumbo cut-through packets, 599 * or with a customized classifier). 600 */ 601 if (idesc->be || idesc->me || idesc->tr || idesc->ce) { 602 if (dev) 603 tile_net_stats_add(1, &dev->stats.rx_errors); 604 goto drop; 605 } 606 607 /* Get the "l2_offset", if allowed. */ 608 l2_offset = custom_flag ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc); 609 610 /* Get the VA (including NET_IP_ALIGN bytes of "headroom"). */ 611 va = tile_io_addr_to_va((unsigned long)idesc->va); 612 613 /* Get the actual packet start/length. */ 614 buf = va + l2_offset; 615 len = idesc->l2_size - l2_offset; 616 617 /* Point "va" at the raw buffer. */ 618 va -= NET_IP_ALIGN; 619 620 filter = filter_packet(dev, buf); 621 if (filter) { 622 if (dev) 623 tile_net_stats_add(1, &dev->stats.rx_dropped); 624drop: 625 gxio_mpipe_iqueue_drop(&info->mpipe[instance].iqueue, idesc); 626 } else { 627 struct sk_buff *skb = mpipe_buf_to_skb(va); 628 629 /* Skip headroom, and any custom header. */ 630 skb_reserve(skb, NET_IP_ALIGN + l2_offset); 631 632 tile_net_receive_skb(dev, skb, idesc, len); 633 } 634 635 gxio_mpipe_iqueue_consume(&info->mpipe[instance].iqueue, idesc); 636 return !filter; 637} 638 639/* Handle some packets for the current CPU. 640 * 641 * This function handles up to TILE_NET_BATCH idescs per call. 642 * 643 * ISSUE: Since we do not provide new buffers until this function is 644 * complete, we must initially provide enough buffers for each network 645 * cpu to fill its iqueue and also its batched idescs. 
646 * 647 * ISSUE: The "rotting packet" race condition occurs if a packet 648 * arrives after the queue appears to be empty, and before the 649 * hypervisor interrupt is re-enabled. 650 */ 651static int tile_net_poll(struct napi_struct *napi, int budget) 652{ 653 struct tile_net_info *info = this_cpu_ptr(&per_cpu_info); 654 unsigned int work = 0; 655 gxio_mpipe_idesc_t *idesc; 656 int instance, i, n; 657 struct mpipe_data *md; 658 struct info_mpipe *info_mpipe = 659 container_of(napi, struct info_mpipe, napi); 660 661 if (budget <= 0) 662 goto done; 663 664 instance = info_mpipe->instance; 665 while ((n = gxio_mpipe_iqueue_try_peek( 666 &info_mpipe->iqueue, 667 &idesc)) > 0) { 668 for (i = 0; i < n; i++) { 669 if (i == TILE_NET_BATCH) 670 goto done; 671 if (tile_net_handle_packet(instance, 672 idesc + i)) { 673 if (++work >= budget) 674 goto done; 675 } 676 } 677 } 678 679 /* There are no packets left. */ 680 napi_complete(&info_mpipe->napi); 681 682 md = &mpipe_data[instance]; 683 /* Re-enable hypervisor interrupts. */ 684 gxio_mpipe_enable_notif_ring_interrupt( 685 &md->context, info->mpipe[instance].iqueue.ring); 686 687 /* HACK: Avoid the "rotting packet" problem. */ 688 if (gxio_mpipe_iqueue_try_peek(&info_mpipe->iqueue, &idesc) > 0) 689 napi_schedule(&info_mpipe->napi); 690 691 /* ISSUE: Handle completions? */ 692 693done: 694 tile_net_provide_needed_buffers(); 695 696 return work; 697} 698 699/* Handle an ingress interrupt from an instance on the current cpu. */ 700static irqreturn_t tile_net_handle_ingress_irq(int irq, void *id) 701{ 702 struct tile_net_info *info = this_cpu_ptr(&per_cpu_info); 703 napi_schedule(&info->mpipe[(uint64_t)id].napi); 704 return IRQ_HANDLED; 705} 706 707/* Free some completions. This must be called with interrupts blocked. */ 708static int tile_net_free_comps(gxio_mpipe_equeue_t *equeue, 709 struct tile_net_comps *comps, 710 int limit, bool force_update) 711{ 712 int n = 0; 713 while (comps->comp_last < comps->comp_next) { 714 unsigned int cid = comps->comp_last % TILE_NET_MAX_COMPS; 715 struct tile_net_comp *comp = &comps->comp_queue[cid]; 716 if (!gxio_mpipe_equeue_is_complete(equeue, comp->when, 717 force_update || n == 0)) 718 break; 719 dev_kfree_skb_irq(comp->skb); 720 comps->comp_last++; 721 if (++n == limit) 722 break; 723 } 724 return n; 725} 726 727/* Add a completion. This must be called with interrupts blocked. 728 * tile_net_equeue_try_reserve() will have ensured a free completion entry. 
729 */ 730static void add_comp(gxio_mpipe_equeue_t *equeue, 731 struct tile_net_comps *comps, 732 uint64_t when, struct sk_buff *skb) 733{ 734 int cid = comps->comp_next % TILE_NET_MAX_COMPS; 735 comps->comp_queue[cid].when = when; 736 comps->comp_queue[cid].skb = skb; 737 comps->comp_next++; 738} 739 740static void tile_net_schedule_tx_wake_timer(struct net_device *dev, 741 int tx_queue_idx) 742{ 743 struct tile_net_info *info = &per_cpu(per_cpu_info, tx_queue_idx); 744 struct tile_net_priv *priv = netdev_priv(dev); 745 int instance = priv->instance; 746 struct tile_net_tx_wake *tx_wake = 747 &info->mpipe[instance].tx_wake[priv->echannel]; 748 749 hrtimer_start(&tx_wake->timer, 750 ktime_set(0, TX_TIMER_DELAY_USEC * 1000UL), 751 HRTIMER_MODE_REL_PINNED); 752} 753 754static enum hrtimer_restart tile_net_handle_tx_wake_timer(struct hrtimer *t) 755{ 756 struct tile_net_tx_wake *tx_wake = 757 container_of(t, struct tile_net_tx_wake, timer); 758 netif_wake_subqueue(tx_wake->dev, tx_wake->tx_queue_idx); 759 return HRTIMER_NORESTART; 760} 761 762/* Make sure the egress timer is scheduled. */ 763static void tile_net_schedule_egress_timer(void) 764{ 765 struct tile_net_info *info = this_cpu_ptr(&per_cpu_info); 766 767 if (!info->egress_timer_scheduled) { 768 hrtimer_start(&info->egress_timer, 769 ktime_set(0, EGRESS_TIMER_DELAY_USEC * 1000UL), 770 HRTIMER_MODE_REL_PINNED); 771 info->egress_timer_scheduled = true; 772 } 773} 774 775/* The "function" for "info->egress_timer". 776 * 777 * This timer will reschedule itself as long as there are any pending 778 * completions expected for this tile. 779 */ 780static enum hrtimer_restart tile_net_handle_egress_timer(struct hrtimer *t) 781{ 782 struct tile_net_info *info = this_cpu_ptr(&per_cpu_info); 783 unsigned long irqflags; 784 bool pending = false; 785 int i, instance; 786 787 local_irq_save(irqflags); 788 789 /* The timer is no longer scheduled. */ 790 info->egress_timer_scheduled = false; 791 792 /* Free all possible comps for this tile. */ 793 for (instance = 0; instance < NR_MPIPE_MAX && 794 info->mpipe[instance].has_iqueue; instance++) { 795 for (i = 0; i < TILE_NET_CHANNELS; i++) { 796 struct tile_net_egress *egress = 797 &mpipe_data[instance].egress_for_echannel[i]; 798 struct tile_net_comps *comps = 799 info->mpipe[instance].comps_for_echannel[i]; 800 if (!egress || comps->comp_last >= comps->comp_next) 801 continue; 802 tile_net_free_comps(egress->equeue, comps, -1, true); 803 pending = pending || 804 (comps->comp_last < comps->comp_next); 805 } 806 } 807 808 /* Reschedule timer if needed. */ 809 if (pending) 810 tile_net_schedule_egress_timer(); 811 812 local_irq_restore(irqflags); 813 814 return HRTIMER_NORESTART; 815} 816 817/* PTP clock operations. 
*/ 818 819static int ptp_mpipe_adjfreq(struct ptp_clock_info *ptp, s32 ppb) 820{ 821 int ret = 0; 822 struct mpipe_data *md = container_of(ptp, struct mpipe_data, caps); 823 mutex_lock(&md->ptp_lock); 824 if (gxio_mpipe_adjust_timestamp_freq(&md->context, ppb)) 825 ret = -EINVAL; 826 mutex_unlock(&md->ptp_lock); 827 return ret; 828} 829 830static int ptp_mpipe_adjtime(struct ptp_clock_info *ptp, s64 delta) 831{ 832 int ret = 0; 833 struct mpipe_data *md = container_of(ptp, struct mpipe_data, caps); 834 mutex_lock(&md->ptp_lock); 835 if (gxio_mpipe_adjust_timestamp(&md->context, delta)) 836 ret = -EBUSY; 837 mutex_unlock(&md->ptp_lock); 838 return ret; 839} 840 841static int ptp_mpipe_gettime(struct ptp_clock_info *ptp, 842 struct timespec64 *ts) 843{ 844 int ret = 0; 845 struct mpipe_data *md = container_of(ptp, struct mpipe_data, caps); 846 mutex_lock(&md->ptp_lock); 847 if (gxio_mpipe_get_timestamp(&md->context, ts)) 848 ret = -EBUSY; 849 mutex_unlock(&md->ptp_lock); 850 return ret; 851} 852 853static int ptp_mpipe_settime(struct ptp_clock_info *ptp, 854 const struct timespec64 *ts) 855{ 856 int ret = 0; 857 struct mpipe_data *md = container_of(ptp, struct mpipe_data, caps); 858 mutex_lock(&md->ptp_lock); 859 if (gxio_mpipe_set_timestamp(&md->context, ts)) 860 ret = -EBUSY; 861 mutex_unlock(&md->ptp_lock); 862 return ret; 863} 864 865static int ptp_mpipe_enable(struct ptp_clock_info *ptp, 866 struct ptp_clock_request *request, int on) 867{ 868 return -EOPNOTSUPP; 869} 870 871static struct ptp_clock_info ptp_mpipe_caps = { 872 .owner = THIS_MODULE, 873 .name = "mPIPE clock", 874 .max_adj = 999999999, 875 .n_ext_ts = 0, 876 .n_pins = 0, 877 .pps = 0, 878 .adjfreq = ptp_mpipe_adjfreq, 879 .adjtime = ptp_mpipe_adjtime, 880 .gettime64 = ptp_mpipe_gettime, 881 .settime64 = ptp_mpipe_settime, 882 .enable = ptp_mpipe_enable, 883}; 884 885/* Sync mPIPE's timestamp up with Linux system time and register PTP clock. */ 886static void register_ptp_clock(struct net_device *dev, struct mpipe_data *md) 887{ 888 struct timespec ts; 889 890 getnstimeofday(&ts); 891 gxio_mpipe_set_timestamp(&md->context, &ts); 892 893 mutex_init(&md->ptp_lock); 894 md->caps = ptp_mpipe_caps; 895 md->ptp_clock = ptp_clock_register(&md->caps, NULL); 896 if (IS_ERR(md->ptp_clock)) 897 netdev_err(dev, "ptp_clock_register failed %ld\n", 898 PTR_ERR(md->ptp_clock)); 899} 900 901/* Initialize PTP fields in a new device. */ 902static void init_ptp_dev(struct tile_net_priv *priv) 903{ 904 priv->stamp_cfg.rx_filter = HWTSTAMP_FILTER_NONE; 905 priv->stamp_cfg.tx_type = HWTSTAMP_TX_OFF; 906} 907 908/* Helper functions for "tile_net_update()". */ 909static void enable_ingress_irq(void *irq) 910{ 911 enable_percpu_irq((long)irq, 0); 912} 913 914static void disable_ingress_irq(void *irq) 915{ 916 disable_percpu_irq((long)irq); 917} 918 919/* Helper function for tile_net_open() and tile_net_stop(). 920 * Always called under tile_net_devs_for_channel_mutex. 
921 */ 922static int tile_net_update(struct net_device *dev) 923{ 924 static gxio_mpipe_rules_t rules; /* too big to fit on the stack */ 925 bool saw_channel = false; 926 int instance = mpipe_instance(dev); 927 struct mpipe_data *md = &mpipe_data[instance]; 928 int channel; 929 int rc; 930 int cpu; 931 932 saw_channel = false; 933 gxio_mpipe_rules_init(&rules, &md->context); 934 935 for (channel = 0; channel < TILE_NET_CHANNELS; channel++) { 936 if (md->tile_net_devs_for_channel[channel] == NULL) 937 continue; 938 if (!saw_channel) { 939 saw_channel = true; 940 gxio_mpipe_rules_begin(&rules, md->first_bucket, 941 md->num_buckets, NULL); 942 gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN); 943 } 944 gxio_mpipe_rules_add_channel(&rules, channel); 945 } 946 947 /* NOTE: This can fail if there is no classifier. 948 * ISSUE: Can anything else cause it to fail? 949 */ 950 rc = gxio_mpipe_rules_commit(&rules); 951 if (rc != 0) { 952 netdev_warn(dev, "gxio_mpipe_rules_commit: mpipe[%d] %d\n", 953 instance, rc); 954 return -EIO; 955 } 956 957 /* Update all cpus, sequentially (to protect "netif_napi_add()"). 958 * We use on_each_cpu to handle the IPI mask or unmask. 959 */ 960 if (!saw_channel) 961 on_each_cpu(disable_ingress_irq, 962 (void *)(long)(md->ingress_irq), 1); 963 for_each_online_cpu(cpu) { 964 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu); 965 966 if (!info->mpipe[instance].has_iqueue) 967 continue; 968 if (saw_channel) { 969 if (!info->mpipe[instance].napi_added) { 970 netif_napi_add(dev, &info->mpipe[instance].napi, 971 tile_net_poll, TILE_NET_WEIGHT); 972 info->mpipe[instance].napi_added = true; 973 } 974 if (!info->mpipe[instance].napi_enabled) { 975 napi_enable(&info->mpipe[instance].napi); 976 info->mpipe[instance].napi_enabled = true; 977 } 978 } else { 979 if (info->mpipe[instance].napi_enabled) { 980 napi_disable(&info->mpipe[instance].napi); 981 info->mpipe[instance].napi_enabled = false; 982 } 983 /* FIXME: Drain the iqueue. */ 984 } 985 } 986 if (saw_channel) 987 on_each_cpu(enable_ingress_irq, 988 (void *)(long)(md->ingress_irq), 1); 989 990 /* HACK: Allow packets to flow in the simulator. */ 991 if (saw_channel) 992 sim_enable_mpipe_links(instance, -1); 993 994 return 0; 995} 996 997/* Initialize a buffer stack. */ 998static int create_buffer_stack(struct net_device *dev, 999 int kind, size_t num_buffers) 1000{ 1001 pte_t hash_pte = pte_set_home((pte_t) { 0 }, PAGE_HOME_HASH); 1002 int instance = mpipe_instance(dev); 1003 struct mpipe_data *md = &mpipe_data[instance]; 1004 size_t needed = gxio_mpipe_calc_buffer_stack_bytes(num_buffers); 1005 int stack_idx = md->first_buffer_stack + kind; 1006 void *va; 1007 int i, rc; 1008 1009 /* Round up to 64KB and then use alloc_pages() so we get the 1010 * required 64KB alignment. 1011 */ 1012 md->buffer_stack_bytes[kind] = 1013 ALIGN(needed, 64 * 1024); 1014 1015 va = alloc_pages_exact(md->buffer_stack_bytes[kind], GFP_KERNEL); 1016 if (va == NULL) { 1017 netdev_err(dev, 1018 "Could not alloc %zd bytes for buffer stack %d\n", 1019 md->buffer_stack_bytes[kind], kind); 1020 return -ENOMEM; 1021 } 1022 1023 /* Initialize the buffer stack. 
*/ 1024 rc = gxio_mpipe_init_buffer_stack(&md->context, stack_idx, 1025 buffer_size_enums[kind], va, 1026 md->buffer_stack_bytes[kind], 0); 1027 if (rc != 0) { 1028 netdev_err(dev, "gxio_mpipe_init_buffer_stack: mpipe[%d] %d\n", 1029 instance, rc); 1030 free_pages_exact(va, md->buffer_stack_bytes[kind]); 1031 return rc; 1032 } 1033 1034 md->buffer_stack_vas[kind] = va; 1035 1036 rc = gxio_mpipe_register_client_memory(&md->context, stack_idx, 1037 hash_pte, 0); 1038 if (rc != 0) { 1039 netdev_err(dev, 1040 "gxio_mpipe_register_client_memory: mpipe[%d] %d\n", 1041 instance, rc); 1042 return rc; 1043 } 1044 1045 /* Provide initial buffers. */ 1046 for (i = 0; i < num_buffers; i++) { 1047 if (!tile_net_provide_buffer(instance, kind)) { 1048 netdev_err(dev, "Cannot allocate initial sk_bufs!\n"); 1049 return -ENOMEM; 1050 } 1051 } 1052 1053 return 0; 1054} 1055 1056/* Allocate and initialize mpipe buffer stacks, and register them in 1057 * the mPIPE TLBs, for small, large, and (possibly) jumbo packet sizes. 1058 * This routine supports tile_net_init_mpipe(), below. 1059 */ 1060static int init_buffer_stacks(struct net_device *dev, 1061 int network_cpus_count) 1062{ 1063 int num_kinds = MAX_KINDS - (jumbo_num == 0); 1064 size_t num_buffers; 1065 int rc; 1066 int instance = mpipe_instance(dev); 1067 struct mpipe_data *md = &mpipe_data[instance]; 1068 1069 /* Allocate the buffer stacks. */ 1070 rc = gxio_mpipe_alloc_buffer_stacks(&md->context, num_kinds, 0, 0); 1071 if (rc < 0) { 1072 netdev_err(dev, 1073 "gxio_mpipe_alloc_buffer_stacks: mpipe[%d] %d\n", 1074 instance, rc); 1075 return rc; 1076 } 1077 md->first_buffer_stack = rc; 1078 1079 /* Enough small/large buffers to (normally) avoid buffer errors. */ 1080 num_buffers = 1081 network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH); 1082 1083 /* Allocate the small memory stack. */ 1084 if (rc >= 0) 1085 rc = create_buffer_stack(dev, 0, num_buffers); 1086 1087 /* Allocate the large buffer stack. */ 1088 if (rc >= 0) 1089 rc = create_buffer_stack(dev, 1, num_buffers); 1090 1091 /* Allocate the jumbo buffer stack if needed. */ 1092 if (rc >= 0 && jumbo_num != 0) 1093 rc = create_buffer_stack(dev, 2, jumbo_num); 1094 1095 return rc; 1096} 1097 1098/* Allocate per-cpu resources (memory for completions and idescs). 1099 * This routine supports tile_net_init_mpipe(), below. 1100 */ 1101static int alloc_percpu_mpipe_resources(struct net_device *dev, 1102 int cpu, int ring) 1103{ 1104 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu); 1105 int order, i, rc; 1106 int instance = mpipe_instance(dev); 1107 struct mpipe_data *md = &mpipe_data[instance]; 1108 struct page *page; 1109 void *addr; 1110 1111 /* Allocate the "comps". */ 1112 order = get_order(COMPS_SIZE); 1113 page = homecache_alloc_pages(GFP_KERNEL, order, cpu); 1114 if (page == NULL) { 1115 netdev_err(dev, "Failed to alloc %zd bytes comps memory\n", 1116 COMPS_SIZE); 1117 return -ENOMEM; 1118 } 1119 addr = pfn_to_kaddr(page_to_pfn(page)); 1120 memset(addr, 0, COMPS_SIZE); 1121 for (i = 0; i < TILE_NET_CHANNELS; i++) 1122 info->mpipe[instance].comps_for_echannel[i] = 1123 addr + i * sizeof(struct tile_net_comps); 1124 1125 /* If this is a network cpu, create an iqueue. 
*/ 1126 if (cpumask_test_cpu(cpu, &network_cpus_map)) { 1127 order = get_order(NOTIF_RING_SIZE); 1128 page = homecache_alloc_pages(GFP_KERNEL, order, cpu); 1129 if (page == NULL) { 1130 netdev_err(dev, 1131 "Failed to alloc %zd bytes iqueue memory\n", 1132 NOTIF_RING_SIZE); 1133 return -ENOMEM; 1134 } 1135 addr = pfn_to_kaddr(page_to_pfn(page)); 1136 rc = gxio_mpipe_iqueue_init(&info->mpipe[instance].iqueue, 1137 &md->context, ring++, addr, 1138 NOTIF_RING_SIZE, 0); 1139 if (rc < 0) { 1140 netdev_err(dev, 1141 "gxio_mpipe_iqueue_init failed: %d\n", rc); 1142 return rc; 1143 } 1144 info->mpipe[instance].has_iqueue = true; 1145 } 1146 1147 return ring; 1148} 1149 1150/* Initialize NotifGroup and buckets. 1151 * This routine supports tile_net_init_mpipe(), below. 1152 */ 1153static int init_notif_group_and_buckets(struct net_device *dev, 1154 int ring, int network_cpus_count) 1155{ 1156 int group, rc; 1157 int instance = mpipe_instance(dev); 1158 struct mpipe_data *md = &mpipe_data[instance]; 1159 1160 /* Allocate one NotifGroup. */ 1161 rc = gxio_mpipe_alloc_notif_groups(&md->context, 1, 0, 0); 1162 if (rc < 0) { 1163 netdev_err(dev, "gxio_mpipe_alloc_notif_groups: mpipe[%d] %d\n", 1164 instance, rc); 1165 return rc; 1166 } 1167 group = rc; 1168 1169 /* Initialize global num_buckets value. */ 1170 if (network_cpus_count > 4) 1171 md->num_buckets = 256; 1172 else if (network_cpus_count > 1) 1173 md->num_buckets = 16; 1174 1175 /* Allocate some buckets, and set global first_bucket value. */ 1176 rc = gxio_mpipe_alloc_buckets(&md->context, md->num_buckets, 0, 0); 1177 if (rc < 0) { 1178 netdev_err(dev, "gxio_mpipe_alloc_buckets: mpipe[%d] %d\n", 1179 instance, rc); 1180 return rc; 1181 } 1182 md->first_bucket = rc; 1183 1184 /* Init group and buckets. */ 1185 rc = gxio_mpipe_init_notif_group_and_buckets( 1186 &md->context, group, ring, network_cpus_count, 1187 md->first_bucket, md->num_buckets, 1188 GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY); 1189 if (rc != 0) { 1190 netdev_err(dev, "gxio_mpipe_init_notif_group_and_buckets: " 1191 "mpipe[%d] %d\n", instance, rc); 1192 return rc; 1193 } 1194 1195 return 0; 1196} 1197 1198/* Create an irq and register it, then activate the irq and request 1199 * interrupts on all cores. Note that "ingress_irq" being initialized 1200 * is how we know not to call tile_net_init_mpipe() again. 1201 * This routine supports tile_net_init_mpipe(), below. 
1202 */ 1203static int tile_net_setup_interrupts(struct net_device *dev) 1204{ 1205 int cpu, rc, irq; 1206 int instance = mpipe_instance(dev); 1207 struct mpipe_data *md = &mpipe_data[instance]; 1208 1209 irq = md->ingress_irq; 1210 if (irq < 0) { 1211 irq = irq_alloc_hwirq(-1); 1212 if (!irq) { 1213 netdev_err(dev, 1214 "create_irq failed: mpipe[%d] %d\n", 1215 instance, irq); 1216 return irq; 1217 } 1218 tile_irq_activate(irq, TILE_IRQ_PERCPU); 1219 1220 rc = request_irq(irq, tile_net_handle_ingress_irq, 1221 0, "tile_net", (void *)((uint64_t)instance)); 1222 1223 if (rc != 0) { 1224 netdev_err(dev, "request_irq failed: mpipe[%d] %d\n", 1225 instance, rc); 1226 irq_free_hwirq(irq); 1227 return rc; 1228 } 1229 md->ingress_irq = irq; 1230 } 1231 1232 for_each_online_cpu(cpu) { 1233 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu); 1234 if (info->mpipe[instance].has_iqueue) { 1235 gxio_mpipe_request_notif_ring_interrupt(&md->context, 1236 cpu_x(cpu), cpu_y(cpu), KERNEL_PL, irq, 1237 info->mpipe[instance].iqueue.ring); 1238 } 1239 } 1240 1241 return 0; 1242} 1243 1244/* Undo any state set up partially by a failed call to tile_net_init_mpipe. */ 1245static void tile_net_init_mpipe_fail(int instance) 1246{ 1247 int kind, cpu; 1248 struct mpipe_data *md = &mpipe_data[instance]; 1249 1250 /* Do cleanups that require the mpipe context first. */ 1251 for (kind = 0; kind < MAX_KINDS; kind++) { 1252 if (md->buffer_stack_vas[kind] != NULL) { 1253 tile_net_pop_all_buffers(instance, 1254 md->first_buffer_stack + 1255 kind); 1256 } 1257 } 1258 1259 /* Destroy mpipe context so the hardware no longer owns any memory. */ 1260 gxio_mpipe_destroy(&md->context); 1261 1262 for_each_online_cpu(cpu) { 1263 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu); 1264 free_pages( 1265 (unsigned long)( 1266 info->mpipe[instance].comps_for_echannel[0]), 1267 get_order(COMPS_SIZE)); 1268 info->mpipe[instance].comps_for_echannel[0] = NULL; 1269 free_pages((unsigned long)(info->mpipe[instance].iqueue.idescs), 1270 get_order(NOTIF_RING_SIZE)); 1271 info->mpipe[instance].iqueue.idescs = NULL; 1272 } 1273 1274 for (kind = 0; kind < MAX_KINDS; kind++) { 1275 if (md->buffer_stack_vas[kind] != NULL) { 1276 free_pages_exact(md->buffer_stack_vas[kind], 1277 md->buffer_stack_bytes[kind]); 1278 md->buffer_stack_vas[kind] = NULL; 1279 } 1280 } 1281 1282 md->first_buffer_stack = -1; 1283 md->first_bucket = -1; 1284} 1285 1286/* The first time any tilegx network device is opened, we initialize 1287 * the global mpipe state. If this step fails, we fail to open the 1288 * device, but if it succeeds, we never need to do it again, and since 1289 * tile_net can't be unloaded, we never undo it. 1290 * 1291 * Note that some resources in this path (buffer stack indices, 1292 * bindings from init_buffer_stack, etc.) are hypervisor resources 1293 * that are freed implicitly by gxio_mpipe_destroy(). 1294 */ 1295static int tile_net_init_mpipe(struct net_device *dev) 1296{ 1297 int rc; 1298 int cpu; 1299 int first_ring, ring; 1300 int instance = mpipe_instance(dev); 1301 struct mpipe_data *md = &mpipe_data[instance]; 1302 int network_cpus_count = cpumask_weight(&network_cpus_map); 1303 1304 if (!hash_default) { 1305 netdev_err(dev, "Networking requires hash_default!\n"); 1306 return -EIO; 1307 } 1308 1309 rc = gxio_mpipe_init(&md->context, instance); 1310 if (rc != 0) { 1311 netdev_err(dev, "gxio_mpipe_init: mpipe[%d] %d\n", 1312 instance, rc); 1313 return -EIO; 1314 } 1315 1316 /* Set up the buffer stacks. 
*/ 1317 rc = init_buffer_stacks(dev, network_cpus_count); 1318 if (rc != 0) 1319 goto fail; 1320 1321 /* Allocate one NotifRing for each network cpu. */ 1322 rc = gxio_mpipe_alloc_notif_rings(&md->context, 1323 network_cpus_count, 0, 0); 1324 if (rc < 0) { 1325 netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n", 1326 rc); 1327 goto fail; 1328 } 1329 1330 /* Init NotifRings per-cpu. */ 1331 first_ring = rc; 1332 ring = first_ring; 1333 for_each_online_cpu(cpu) { 1334 rc = alloc_percpu_mpipe_resources(dev, cpu, ring); 1335 if (rc < 0) 1336 goto fail; 1337 ring = rc; 1338 } 1339 1340 /* Initialize NotifGroup and buckets. */ 1341 rc = init_notif_group_and_buckets(dev, first_ring, network_cpus_count); 1342 if (rc != 0) 1343 goto fail; 1344 1345 /* Create and enable interrupts. */ 1346 rc = tile_net_setup_interrupts(dev); 1347 if (rc != 0) 1348 goto fail; 1349 1350 /* Register PTP clock and set mPIPE timestamp, if configured. */ 1351 register_ptp_clock(dev, md); 1352 1353 return 0; 1354 1355fail: 1356 tile_net_init_mpipe_fail(instance); 1357 return rc; 1358} 1359 1360/* Create persistent egress info for a given egress channel. 1361 * Note that this may be shared between, say, "gbe0" and "xgbe0". 1362 * ISSUE: Defer header allocation until TSO is actually needed? 1363 */ 1364static int tile_net_init_egress(struct net_device *dev, int echannel) 1365{ 1366 static int ering = -1; 1367 struct page *headers_page, *edescs_page, *equeue_page; 1368 gxio_mpipe_edesc_t *edescs; 1369 gxio_mpipe_equeue_t *equeue; 1370 unsigned char *headers; 1371 int headers_order, edescs_order, equeue_order; 1372 size_t edescs_size; 1373 int rc = -ENOMEM; 1374 int instance = mpipe_instance(dev); 1375 struct mpipe_data *md = &mpipe_data[instance]; 1376 1377 /* Only initialize once. */ 1378 if (md->egress_for_echannel[echannel].equeue != NULL) 1379 return 0; 1380 1381 /* Allocate memory for the "headers". */ 1382 headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES); 1383 headers_page = alloc_pages(GFP_KERNEL, headers_order); 1384 if (headers_page == NULL) { 1385 netdev_warn(dev, 1386 "Could not alloc %zd bytes for TSO headers.\n", 1387 PAGE_SIZE << headers_order); 1388 goto fail; 1389 } 1390 headers = pfn_to_kaddr(page_to_pfn(headers_page)); 1391 1392 /* Allocate memory for the "edescs". */ 1393 edescs_size = EQUEUE_ENTRIES * sizeof(*edescs); 1394 edescs_order = get_order(edescs_size); 1395 edescs_page = alloc_pages(GFP_KERNEL, edescs_order); 1396 if (edescs_page == NULL) { 1397 netdev_warn(dev, 1398 "Could not alloc %zd bytes for eDMA ring.\n", 1399 edescs_size); 1400 goto fail_headers; 1401 } 1402 edescs = pfn_to_kaddr(page_to_pfn(edescs_page)); 1403 1404 /* Allocate memory for the "equeue". */ 1405 equeue_order = get_order(sizeof(*equeue)); 1406 equeue_page = alloc_pages(GFP_KERNEL, equeue_order); 1407 if (equeue_page == NULL) { 1408 netdev_warn(dev, 1409 "Could not alloc %zd bytes for equeue info.\n", 1410 PAGE_SIZE << equeue_order); 1411 goto fail_edescs; 1412 } 1413 equeue = pfn_to_kaddr(page_to_pfn(equeue_page)); 1414 1415 /* Allocate an edma ring (using a one entry "free list"). */ 1416 if (ering < 0) { 1417 rc = gxio_mpipe_alloc_edma_rings(&md->context, 1, 0, 0); 1418 if (rc < 0) { 1419 netdev_warn(dev, "gxio_mpipe_alloc_edma_rings: " 1420 "mpipe[%d] %d\n", instance, rc); 1421 goto fail_equeue; 1422 } 1423 ering = rc; 1424 } 1425 1426 /* Initialize the equeue. 
*/ 1427 rc = gxio_mpipe_equeue_init(equeue, &md->context, ering, echannel, 1428 edescs, edescs_size, 0); 1429 if (rc != 0) { 1430 netdev_err(dev, "gxio_mpipe_equeue_init: mpipe[%d] %d\n", 1431 instance, rc); 1432 goto fail_equeue; 1433 } 1434 1435 /* Don't reuse the ering later. */ 1436 ering = -1; 1437 1438 if (jumbo_num != 0) { 1439 /* Make sure "jumbo" packets can be egressed safely. */ 1440 if (gxio_mpipe_equeue_set_snf_size(equeue, 10368) < 0) { 1441 /* ISSUE: There is no "gxio_mpipe_equeue_destroy()". */ 1442 netdev_warn(dev, "Jumbo packets may not be egressed" 1443 " properly on channel %d\n", echannel); 1444 } 1445 } 1446 1447 /* Done. */ 1448 md->egress_for_echannel[echannel].equeue = equeue; 1449 md->egress_for_echannel[echannel].headers = headers; 1450 return 0; 1451 1452fail_equeue: 1453 __free_pages(equeue_page, equeue_order); 1454 1455fail_edescs: 1456 __free_pages(edescs_page, edescs_order); 1457 1458fail_headers: 1459 __free_pages(headers_page, headers_order); 1460 1461fail: 1462 return rc; 1463} 1464 1465/* Return channel number for a newly-opened link. */ 1466static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link, 1467 const char *link_name) 1468{ 1469 int instance = mpipe_instance(dev); 1470 struct mpipe_data *md = &mpipe_data[instance]; 1471 int rc = gxio_mpipe_link_open(link, &md->context, link_name, 0); 1472 if (rc < 0) { 1473 netdev_err(dev, "Failed to open '%s', mpipe[%d], %d\n", 1474 link_name, instance, rc); 1475 return rc; 1476 } 1477 if (jumbo_num != 0) { 1478 u32 attr = GXIO_MPIPE_LINK_RECEIVE_JUMBO; 1479 rc = gxio_mpipe_link_set_attr(link, attr, 1); 1480 if (rc != 0) { 1481 netdev_err(dev, 1482 "Cannot receive jumbo packets on '%s'\n", 1483 link_name); 1484 gxio_mpipe_link_close(link); 1485 return rc; 1486 } 1487 } 1488 rc = gxio_mpipe_link_channel(link); 1489 if (rc < 0 || rc >= TILE_NET_CHANNELS) { 1490 netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", rc); 1491 gxio_mpipe_link_close(link); 1492 return -EINVAL; 1493 } 1494 return rc; 1495} 1496 1497/* Help the kernel activate the given network interface. */ 1498static int tile_net_open(struct net_device *dev) 1499{ 1500 struct tile_net_priv *priv = netdev_priv(dev); 1501 int cpu, rc, instance; 1502 1503 mutex_lock(&tile_net_devs_for_channel_mutex); 1504 1505 /* Get the instance info. */ 1506 rc = gxio_mpipe_link_instance(dev->name); 1507 if (rc < 0 || rc >= NR_MPIPE_MAX) { 1508 mutex_unlock(&tile_net_devs_for_channel_mutex); 1509 return -EIO; 1510 } 1511 1512 priv->instance = rc; 1513 instance = rc; 1514 if (!mpipe_data[rc].context.mmio_fast_base) { 1515 /* Do one-time initialization per instance the first time 1516 * any device is opened. 1517 */ 1518 rc = tile_net_init_mpipe(dev); 1519 if (rc != 0) 1520 goto fail; 1521 } 1522 1523 /* Determine if this is the "loopify" device. */ 1524 if (unlikely((loopify_link_name != NULL) && 1525 !strcmp(dev->name, loopify_link_name))) { 1526 rc = tile_net_link_open(dev, &priv->link, "loop0"); 1527 if (rc < 0) 1528 goto fail; 1529 priv->channel = rc; 1530 rc = tile_net_link_open(dev, &priv->loopify_link, "loop1"); 1531 if (rc < 0) 1532 goto fail; 1533 priv->loopify_channel = rc; 1534 priv->echannel = rc; 1535 } else { 1536 rc = tile_net_link_open(dev, &priv->link, dev->name); 1537 if (rc < 0) 1538 goto fail; 1539 priv->channel = rc; 1540 priv->echannel = rc; 1541 } 1542 1543 /* Initialize egress info (if needed). Once ever, per echannel. 
*/ 1544 rc = tile_net_init_egress(dev, priv->echannel); 1545 if (rc != 0) 1546 goto fail; 1547 1548 mpipe_data[instance].tile_net_devs_for_channel[priv->channel] = dev; 1549 1550 rc = tile_net_update(dev); 1551 if (rc != 0) 1552 goto fail; 1553 1554 mutex_unlock(&tile_net_devs_for_channel_mutex); 1555 1556 /* Initialize the transmit wake timer for this device for each cpu. */ 1557 for_each_online_cpu(cpu) { 1558 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu); 1559 struct tile_net_tx_wake *tx_wake = 1560 &info->mpipe[instance].tx_wake[priv->echannel]; 1561 1562 hrtimer_init(&tx_wake->timer, CLOCK_MONOTONIC, 1563 HRTIMER_MODE_REL); 1564 tx_wake->tx_queue_idx = cpu; 1565 tx_wake->timer.function = tile_net_handle_tx_wake_timer; 1566 tx_wake->dev = dev; 1567 } 1568 1569 for_each_online_cpu(cpu) 1570 netif_start_subqueue(dev, cpu); 1571 netif_carrier_on(dev); 1572 return 0; 1573 1574fail: 1575 if (priv->loopify_channel >= 0) { 1576 if (gxio_mpipe_link_close(&priv->loopify_link) != 0) 1577 netdev_warn(dev, "Failed to close loopify link!\n"); 1578 priv->loopify_channel = -1; 1579 } 1580 if (priv->channel >= 0) { 1581 if (gxio_mpipe_link_close(&priv->link) != 0) 1582 netdev_warn(dev, "Failed to close link!\n"); 1583 priv->channel = -1; 1584 } 1585 priv->echannel = -1; 1586 mpipe_data[instance].tile_net_devs_for_channel[priv->channel] = NULL; 1587 mutex_unlock(&tile_net_devs_for_channel_mutex); 1588 1589 /* Don't return raw gxio error codes to generic Linux. */ 1590 return (rc > -512) ? rc : -EIO; 1591} 1592 1593/* Help the kernel deactivate the given network interface. */ 1594static int tile_net_stop(struct net_device *dev) 1595{ 1596 struct tile_net_priv *priv = netdev_priv(dev); 1597 int cpu; 1598 int instance = priv->instance; 1599 struct mpipe_data *md = &mpipe_data[instance]; 1600 1601 for_each_online_cpu(cpu) { 1602 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu); 1603 struct tile_net_tx_wake *tx_wake = 1604 &info->mpipe[instance].tx_wake[priv->echannel]; 1605 1606 hrtimer_cancel(&tx_wake->timer); 1607 netif_stop_subqueue(dev, cpu); 1608 } 1609 1610 mutex_lock(&tile_net_devs_for_channel_mutex); 1611 md->tile_net_devs_for_channel[priv->channel] = NULL; 1612 (void)tile_net_update(dev); 1613 if (priv->loopify_channel >= 0) { 1614 if (gxio_mpipe_link_close(&priv->loopify_link) != 0) 1615 netdev_warn(dev, "Failed to close loopify link!\n"); 1616 priv->loopify_channel = -1; 1617 } 1618 if (priv->channel >= 0) { 1619 if (gxio_mpipe_link_close(&priv->link) != 0) 1620 netdev_warn(dev, "Failed to close link!\n"); 1621 priv->channel = -1; 1622 } 1623 priv->echannel = -1; 1624 mutex_unlock(&tile_net_devs_for_channel_mutex); 1625 1626 return 0; 1627} 1628 1629/* Determine the VA for a fragment. */ 1630static inline void *tile_net_frag_buf(skb_frag_t *f) 1631{ 1632 unsigned long pfn = page_to_pfn(skb_frag_page(f)); 1633 return pfn_to_kaddr(pfn) + f->page_offset; 1634} 1635 1636/* Acquire a completion entry and an egress slot, or if we can't, 1637 * stop the queue and schedule the tx_wake timer. 1638 */ 1639static s64 tile_net_equeue_try_reserve(struct net_device *dev, 1640 int tx_queue_idx, 1641 struct tile_net_comps *comps, 1642 gxio_mpipe_equeue_t *equeue, 1643 int num_edescs) 1644{ 1645 /* Try to acquire a completion entry. */ 1646 if (comps->comp_next - comps->comp_last < TILE_NET_MAX_COMPS - 1 || 1647 tile_net_free_comps(equeue, comps, 32, false) != 0) { 1648 1649 /* Try to acquire an egress slot. 
*/ 1650 s64 slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs); 1651 if (slot >= 0) 1652 return slot; 1653 1654 /* Freeing some completions gives the equeue time to drain. */ 1655 tile_net_free_comps(equeue, comps, TILE_NET_MAX_COMPS, false); 1656 1657 slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs); 1658 if (slot >= 0) 1659 return slot; 1660 } 1661 1662 /* Still nothing; give up and stop the queue for a short while. */ 1663 netif_stop_subqueue(dev, tx_queue_idx); 1664 tile_net_schedule_tx_wake_timer(dev, tx_queue_idx); 1665 return -1; 1666} 1667 1668/* Determine how many edesc's are needed for TSO. 1669 * 1670 * Sometimes, if "sendfile()" requires copying, we will be called with 1671 * "data" containing the header and payload, with "frags" being empty. 1672 * Sometimes, for example when using NFS over TCP, a single segment can 1673 * span 3 fragments. This requires special care. 1674 */ 1675static int tso_count_edescs(struct sk_buff *skb) 1676{ 1677 struct skb_shared_info *sh = skb_shinfo(skb); 1678 unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb); 1679 unsigned int data_len = skb->len - sh_len; 1680 unsigned int p_len = sh->gso_size; 1681 long f_id = -1; /* id of the current fragment */ 1682 long f_size = skb_headlen(skb) - sh_len; /* current fragment size */ 1683 long f_used = 0; /* bytes used from the current fragment */ 1684 long n; /* size of the current piece of payload */ 1685 int num_edescs = 0; 1686 int segment; 1687 1688 for (segment = 0; segment < sh->gso_segs; segment++) { 1689 1690 unsigned int p_used = 0; 1691 1692 /* One edesc for header and for each piece of the payload. */ 1693 for (num_edescs++; p_used < p_len; num_edescs++) { 1694 1695 /* Advance as needed. */ 1696 while (f_used >= f_size) { 1697 f_id++; 1698 f_size = skb_frag_size(&sh->frags[f_id]); 1699 f_used = 0; 1700 } 1701 1702 /* Use bytes from the current fragment. */ 1703 n = p_len - p_used; 1704 if (n > f_size - f_used) 1705 n = f_size - f_used; 1706 f_used += n; 1707 p_used += n; 1708 } 1709 1710 /* The last segment may be less than gso_size. */ 1711 data_len -= p_len; 1712 if (data_len < p_len) 1713 p_len = data_len; 1714 } 1715 1716 return num_edescs; 1717} 1718 1719/* Prepare modified copies of the skbuff headers. */ 1720static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers, 1721 s64 slot) 1722{ 1723 struct skb_shared_info *sh = skb_shinfo(skb); 1724 struct iphdr *ih; 1725 struct ipv6hdr *ih6; 1726 struct tcphdr *th; 1727 unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb); 1728 unsigned int data_len = skb->len - sh_len; 1729 unsigned char *data = skb->data; 1730 unsigned int ih_off, th_off, p_len; 1731 unsigned int isum_seed, tsum_seed, seq; 1732 unsigned int uninitialized_var(id); 1733 int is_ipv6; 1734 long f_id = -1; /* id of the current fragment */ 1735 long f_size = skb_headlen(skb) - sh_len; /* current fragment size */ 1736 long f_used = 0; /* bytes used from the current fragment */ 1737 long n; /* size of the current piece of payload */ 1738 int segment; 1739 1740 /* Locate original headers and compute various lengths. 
*/ 1741 is_ipv6 = skb_is_gso_v6(skb); 1742 if (is_ipv6) { 1743 ih6 = ipv6_hdr(skb); 1744 ih_off = skb_network_offset(skb); 1745 } else { 1746 ih = ip_hdr(skb); 1747 ih_off = skb_network_offset(skb); 1748 isum_seed = ((0xFFFF - ih->check) + 1749 (0xFFFF - ih->tot_len) + 1750 (0xFFFF - ih->id)); 1751 id = ntohs(ih->id); 1752 } 1753 1754 th = tcp_hdr(skb); 1755 th_off = skb_transport_offset(skb); 1756 p_len = sh->gso_size; 1757 1758 tsum_seed = th->check + (0xFFFF ^ htons(skb->len)); 1759 seq = ntohl(th->seq); 1760 1761 /* Prepare all the headers. */ 1762 for (segment = 0; segment < sh->gso_segs; segment++) { 1763 unsigned char *buf; 1764 unsigned int p_used = 0; 1765 1766 /* Copy to the header memory for this segment. */ 1767 buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES + 1768 NET_IP_ALIGN; 1769 memcpy(buf, data, sh_len); 1770 1771 /* Update copied ip header. */ 1772 if (is_ipv6) { 1773 ih6 = (struct ipv6hdr *)(buf + ih_off); 1774 ih6->payload_len = htons(sh_len + p_len - ih_off - 1775 sizeof(*ih6)); 1776 } else { 1777 ih = (struct iphdr *)(buf + ih_off); 1778 ih->tot_len = htons(sh_len + p_len - ih_off); 1779 ih->id = htons(id++); 1780 ih->check = csum_long(isum_seed + ih->tot_len + 1781 ih->id) ^ 0xffff; 1782 } 1783 1784 /* Update copied tcp header. */ 1785 th = (struct tcphdr *)(buf + th_off); 1786 th->seq = htonl(seq); 1787 th->check = csum_long(tsum_seed + htons(sh_len + p_len)); 1788 if (segment != sh->gso_segs - 1) { 1789 th->fin = 0; 1790 th->psh = 0; 1791 } 1792 1793 /* Skip past the header. */ 1794 slot++; 1795 1796 /* Skip past the payload. */ 1797 while (p_used < p_len) { 1798 1799 /* Advance as needed. */ 1800 while (f_used >= f_size) { 1801 f_id++; 1802 f_size = skb_frag_size(&sh->frags[f_id]); 1803 f_used = 0; 1804 } 1805 1806 /* Use bytes from the current fragment. */ 1807 n = p_len - p_used; 1808 if (n > f_size - f_used) 1809 n = f_size - f_used; 1810 f_used += n; 1811 p_used += n; 1812 1813 slot++; 1814 } 1815 1816 seq += p_len; 1817 1818 /* The last segment may be less than gso_size. */ 1819 data_len -= p_len; 1820 if (data_len < p_len) 1821 p_len = data_len; 1822 } 1823 1824 /* Flush the headers so they are ready for hardware DMA. */ 1825 wmb(); 1826} 1827 1828/* Pass all the data to mpipe for egress. */ 1829static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue, 1830 struct sk_buff *skb, unsigned char *headers, s64 slot) 1831{ 1832 struct skb_shared_info *sh = skb_shinfo(skb); 1833 int instance = mpipe_instance(dev); 1834 struct mpipe_data *md = &mpipe_data[instance]; 1835 unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb); 1836 unsigned int data_len = skb->len - sh_len; 1837 unsigned int p_len = sh->gso_size; 1838 gxio_mpipe_edesc_t edesc_head = { { 0 } }; 1839 gxio_mpipe_edesc_t edesc_body = { { 0 } }; 1840 long f_id = -1; /* id of the current fragment */ 1841 long f_size = skb_headlen(skb) - sh_len; /* current fragment size */ 1842 long f_used = 0; /* bytes used from the current fragment */ 1843 void *f_data = skb->data + sh_len; 1844 long n; /* size of the current piece of payload */ 1845 unsigned long tx_packets = 0, tx_bytes = 0; 1846 unsigned int csum_start; 1847 int segment; 1848 1849 /* Prepare to egress the headers: set up header edesc. */ 1850 csum_start = skb_checksum_start_offset(skb); 1851 edesc_head.csum = 1; 1852 edesc_head.csum_start = csum_start; 1853 edesc_head.csum_dest = csum_start + skb->csum_offset; 1854 edesc_head.xfer_size = sh_len; 1855 1856 /* This is only used to specify the TLB. 
	 */
	edesc_head.stack_idx = md->first_buffer_stack;
	edesc_body.stack_idx = md->first_buffer_stack;

	/* Egress all the edescs. */
	for (segment = 0; segment < sh->gso_segs; segment++) {
		unsigned char *buf;
		unsigned int p_used = 0;

		/* Egress the header. */
		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
			NET_IP_ALIGN;
		edesc_head.va = va_to_tile_io_addr(buf);
		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
		slot++;

		/* Egress the payload. */
		while (p_used < p_len) {
			void *va;

			/* Advance as needed. */
			while (f_used >= f_size) {
				f_id++;
				f_size = skb_frag_size(&sh->frags[f_id]);
				f_data = tile_net_frag_buf(&sh->frags[f_id]);
				f_used = 0;
			}

			va = f_data + f_used;

			/* Use bytes from the current fragment. */
			n = p_len - p_used;
			if (n > f_size - f_used)
				n = f_size - f_used;
			f_used += n;
			p_used += n;

			/* Egress a piece of the payload. */
			edesc_body.va = va_to_tile_io_addr(va);
			edesc_body.xfer_size = n;
			edesc_body.bound = !(p_used < p_len);
			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
			slot++;
		}

		tx_packets++;
		tx_bytes += sh_len + p_len;

		/* The last segment may be less than gso_size. */
		data_len -= p_len;
		if (data_len < p_len)
			p_len = data_len;
	}

	/* Update stats. */
	tile_net_stats_add(tx_packets, &dev->stats.tx_packets);
	tile_net_stats_add(tx_bytes, &dev->stats.tx_bytes);
}

/* Do "TSO" handling for egress.
 *
 * Normally drivers set NETIF_F_TSO only to support hardware TSO;
 * otherwise the stack uses scatter-gather to implement GSO in software.
 * In our testing, enabling GSO support (via NETIF_F_SG) drops network
 * performance to around 7.5 Gbps on the 10G interfaces, although it
 * also drops cpu utilization to under 8%.  But implementing "TSO" in
 * the driver brings performance back up to line rate, while dropping
 * cpu usage even further, to less than 4%.  In practice, profiling of
 * GSO shows that skb_segment() is what causes the performance overhead;
 * the driver benefits from using preallocated memory to duplicate the
 * TCP/IP headers.
 */
static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
{
	struct tile_net_info *info = this_cpu_ptr(&per_cpu_info);
	struct tile_net_priv *priv = netdev_priv(dev);
	int channel = priv->echannel;
	int instance = priv->instance;
	struct mpipe_data *md = &mpipe_data[instance];
	struct tile_net_egress *egress = &md->egress_for_echannel[channel];
	struct tile_net_comps *comps =
		info->mpipe[instance].comps_for_echannel[channel];
	gxio_mpipe_equeue_t *equeue = egress->equeue;
	unsigned long irqflags;
	int num_edescs;
	s64 slot;

	/* Determine how many mpipe edesc's are needed. */
	num_edescs = tso_count_edescs(skb);

	local_irq_save(irqflags);

	/* Try to acquire a completion entry and an egress slot. */
	slot = tile_net_equeue_try_reserve(dev, skb->queue_mapping, comps,
					   equeue, num_edescs);
	if (slot < 0) {
		local_irq_restore(irqflags);
		return NETDEV_TX_BUSY;
	}

	/* Set up copies of header data properly. */
	tso_headers_prepare(skb, egress->headers, slot);

	/* Actually pass the data to the network hardware.
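	 * Each segment goes out as one header edesc (pointing into the
	 * per-slot "headers" staging area) followed by one or more payload
	 * edescs that reference the skb data in place; the skb itself is
	 * freed only when the completion recorded below, against the last
	 * slot ("slot + num_edescs - 1"), has completed.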
	 */
	tso_egress(dev, equeue, skb, egress->headers, slot);

	/* Add a completion record. */
	add_comp(equeue, comps, slot + num_edescs - 1, skb);

	local_irq_restore(irqflags);

	/* Make sure the egress timer is scheduled. */
	tile_net_schedule_egress_timer();

	return NETDEV_TX_OK;
}

/* Analyze the body and frags for a transmit request. */
static unsigned int tile_net_tx_frags(struct frag *frags,
				      struct sk_buff *skb,
				      void *b_data, unsigned int b_len)
{
	unsigned int i, n = 0;

	struct skb_shared_info *sh = skb_shinfo(skb);

	if (b_len != 0) {
		frags[n].buf = b_data;
		frags[n++].length = b_len;
	}

	for (i = 0; i < sh->nr_frags; i++) {
		skb_frag_t *f = &sh->frags[i];
		frags[n].buf = tile_net_frag_buf(f);
		frags[n++].length = skb_frag_size(f);
	}

	return n;
}

/* Help the kernel transmit a packet. */
static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
{
	struct tile_net_info *info = this_cpu_ptr(&per_cpu_info);
	struct tile_net_priv *priv = netdev_priv(dev);
	int instance = priv->instance;
	struct mpipe_data *md = &mpipe_data[instance];
	struct tile_net_egress *egress =
		&md->egress_for_echannel[priv->echannel];
	gxio_mpipe_equeue_t *equeue = egress->equeue;
	struct tile_net_comps *comps =
		info->mpipe[instance].comps_for_echannel[priv->echannel];
	unsigned int len = skb->len;
	unsigned char *data = skb->data;
	unsigned int num_edescs;
	struct frag frags[MAX_FRAGS];
	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
	unsigned long irqflags;
	gxio_mpipe_edesc_t edesc = { { 0 } };
	unsigned int i;
	s64 slot;

	if (skb_is_gso(skb))
		return tile_net_tx_tso(skb, dev);

	num_edescs = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));

	/* This is only used to specify the TLB. */
	edesc.stack_idx = md->first_buffer_stack;

	/* Prepare the edescs. */
	for (i = 0; i < num_edescs; i++) {
		edesc.xfer_size = frags[i].length;
		edesc.va = va_to_tile_io_addr(frags[i].buf);
		edescs[i] = edesc;
	}

	/* Mark the final edesc. */
	edescs[num_edescs - 1].bound = 1;

	/* Add checksum info to the initial edesc, if needed. */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		unsigned int csum_start = skb_checksum_start_offset(skb);
		edescs[0].csum = 1;
		edescs[0].csum_start = csum_start;
		edescs[0].csum_dest = csum_start + skb->csum_offset;
	}

	local_irq_save(irqflags);

	/* Try to acquire a completion entry and an egress slot. */
	slot = tile_net_equeue_try_reserve(dev, skb->queue_mapping, comps,
					   equeue, num_edescs);
	if (slot < 0) {
		local_irq_restore(irqflags);
		return NETDEV_TX_BUSY;
	}

	for (i = 0; i < num_edescs; i++)
		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot++);

	/* Store TX timestamp if needed. */
	tile_tx_timestamp(skb, instance);

	/* Add a completion record. */
	add_comp(equeue, comps, slot - 1, skb);

	/* NOTE: Use ETH_ZLEN for short packets (e.g. 42 < 60). */
	tile_net_stats_add(1, &dev->stats.tx_packets);
	tile_net_stats_add(max_t(unsigned int, len, ETH_ZLEN),
			   &dev->stats.tx_bytes);

	local_irq_restore(irqflags);

	/* Make sure the egress timer is scheduled.
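	 * The per-cpu egress timer is what eventually frees the completion
	 * entries (and their skbs) if no later transmit comes along to
	 * free them inline.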
	 */
	tile_net_schedule_egress_timer();

	return NETDEV_TX_OK;
}

/* Return subqueue id on this core (one per core). */
static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb,
				 void *accel_priv,
				 select_queue_fallback_t fallback)
{
	return smp_processor_id();
}

/* Deal with a transmit timeout. */
static void tile_net_tx_timeout(struct net_device *dev)
{
	int cpu;

	for_each_online_cpu(cpu)
		netif_wake_subqueue(dev, cpu);
}

/* Ioctl commands. */
static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
	if (cmd == SIOCSHWTSTAMP)
		return tile_hwtstamp_set(dev, rq);
	if (cmd == SIOCGHWTSTAMP)
		return tile_hwtstamp_get(dev, rq);

	return -EOPNOTSUPP;
}

/* Change the MTU. */
static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
{
	if (new_mtu < 68)
		return -EINVAL;
	if (new_mtu > ((jumbo_num != 0) ? 9000 : 1500))
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* Change the Ethernet address of the NIC.
 *
 * The hypervisor driver does not support changing the MAC address.
 * However, the hardware does not do anything with the MAC address, so
 * the address which gets used on outgoing packets, and which is
 * accepted on incoming packets, is completely up to us.
 *
 * Returns 0 on success, negative on failure.
 */
static int tile_net_set_mac_address(struct net_device *dev, void *p)
{
	struct sockaddr *addr = p;

	if (!is_valid_ether_addr(addr->sa_data))
		return -EINVAL;
	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
	return 0;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
/* Polling 'interrupt' - used by things like netconsole to send skbs
 * without having to re-enable interrupts.  It's not called while
 * the interrupt routine is executing.
 */
static void tile_net_netpoll(struct net_device *dev)
{
	int instance = mpipe_instance(dev);
	struct tile_net_info *info = this_cpu_ptr(&per_cpu_info);
	struct mpipe_data *md = &mpipe_data[instance];

	disable_percpu_irq(md->ingress_irq);
	napi_schedule(&info->mpipe[instance].napi);
	enable_percpu_irq(md->ingress_irq, 0);
}
#endif

static const struct net_device_ops tile_net_ops = {
	.ndo_open = tile_net_open,
	.ndo_stop = tile_net_stop,
	.ndo_start_xmit = tile_net_tx,
	.ndo_select_queue = tile_net_select_queue,
	.ndo_do_ioctl = tile_net_ioctl,
	.ndo_change_mtu = tile_net_change_mtu,
	.ndo_tx_timeout = tile_net_tx_timeout,
	.ndo_set_mac_address = tile_net_set_mac_address,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = tile_net_netpoll,
#endif
};

/* The setup function.
 *
 * This uses ether_setup() to assign various fields in dev, including
 * setting IFF_BROADCAST and IFF_MULTICAST, then sets some extra fields.
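 *
 * Note that advertising NETIF_F_TSO/NETIF_F_TSO6 here is what steers
 * GSO skbs into the driver-level TSO path (tile_net_tx_tso()) instead
 * of having the stack segment them in software.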
 */
static void tile_net_setup(struct net_device *dev)
{
	netdev_features_t features = 0;

	ether_setup(dev);
	dev->netdev_ops = &tile_net_ops;
	dev->watchdog_timeo = TILE_NET_TIMEOUT;
	dev->mtu = 1500;

	features |= NETIF_F_HW_CSUM;
	features |= NETIF_F_SG;
	features |= NETIF_F_TSO;
	features |= NETIF_F_TSO6;

	dev->hw_features |= features;
	dev->vlan_features |= features;
	dev->features |= features;
}

/* Allocate the device structure, register the device, and obtain the
 * MAC address from the hypervisor.
 */
static void tile_net_dev_init(const char *name, const uint8_t *mac)
{
	int ret;
	struct net_device *dev;
	struct tile_net_priv *priv;

	/* HACK: Ignore "loop" links. */
	if (strncmp(name, "loop", 4) == 0)
		return;

	/* Allocate the device structure.  Normally, "name" is a
	 * template, instantiated by register_netdev(), but not for us.
	 */
	dev = alloc_netdev_mqs(sizeof(*priv), name, NET_NAME_UNKNOWN,
			       tile_net_setup, NR_CPUS, 1);
	if (!dev) {
		pr_err("alloc_netdev_mqs(%s) failed\n", name);
		return;
	}

	/* Initialize "priv". */
	priv = netdev_priv(dev);
	priv->dev = dev;
	priv->channel = -1;
	priv->loopify_channel = -1;
	priv->echannel = -1;
	init_ptp_dev(priv);

	/* Get the MAC address and set it in the device struct; this must
	 * be done before the device is opened.  If the MAC is all zeroes,
	 * we use a random address, since we're probably on the simulator.
	 */
	if (!is_zero_ether_addr(mac))
		ether_addr_copy(dev->dev_addr, mac);
	else
		eth_hw_addr_random(dev);

	/* Register the network device. */
	ret = register_netdev(dev);
	if (ret) {
		netdev_err(dev, "register_netdev failed %d\n", ret);
		free_netdev(dev);
		return;
	}
}

/* Per-cpu module initialization. */
static void tile_net_init_module_percpu(void *unused)
{
	struct tile_net_info *info = this_cpu_ptr(&per_cpu_info);
	int my_cpu = smp_processor_id();
	int instance;

	for (instance = 0; instance < NR_MPIPE_MAX; instance++) {
		info->mpipe[instance].has_iqueue = false;
		info->mpipe[instance].instance = instance;
	}
	info->my_cpu = my_cpu;

	/* Initialize the egress timer. */
	hrtimer_init(&info->egress_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	info->egress_timer.function = tile_net_handle_egress_timer;
}

/* Module initialization. */
static int __init tile_net_init_module(void)
{
	int i;
	char name[GXIO_MPIPE_LINK_NAME_LEN];
	uint8_t mac[6];

	pr_info("Tilera Network Driver\n");

	BUILD_BUG_ON(NR_MPIPE_MAX != 2);

	mutex_init(&tile_net_devs_for_channel_mutex);

	/* Initialize each CPU. */
	on_each_cpu(tile_net_init_module_percpu, NULL, 1);

	/* Find out what devices we have, and initialize them. */
	for (i = 0; gxio_mpipe_link_enumerate_mac(i, name, mac) >= 0; i++)
		tile_net_dev_init(name, mac);

	if (!network_cpus_init())
		network_cpus_map = *cpu_online_mask;

	return 0;
}

module_init(tile_net_init_module);
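
/* Note: no matching module_exit() is registered above, so once loaded
 * the driver is not expected to be unloaded.
 */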