/*
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/delay.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/hrtimer.h>

#include "hfi.h"
#include "device.h"
#include "common.h"
#include "mad.h"
#include "sdma.h"
#include "debugfs.h"
#include "verbs.h"

#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt

/*
 * min buffers we want to have per context, after driver
 */
#define HFI1_MIN_USER_CTXT_BUFCNT 7

#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */

/*
 * Number of user receive contexts we are configured to use (to allow for more
 * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
 */
uint num_rcv_contexts;
module_param_named(num_rcv_contexts, num_rcv_contexts, uint, S_IRUGO);
MODULE_PARM_DESC(
        num_rcv_contexts, "Set max number of user receive contexts to use");

u8 krcvqs[RXE_NUM_DATA_VL];
int krcvqsset;
module_param_array(krcvqs, byte, &krcvqsset, S_IRUGO);
MODULE_PARM_DESC(krcvqs, "Array of the number of kernel receive queues by VL");

/* computed based on above array */
unsigned n_krcvqs;

static unsigned hfi1_rcvarr_split = 25;
module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");

static uint eager_buffer_size = (2 << 20); /* 2MB */
module_param(eager_buffer_size, uint, S_IRUGO);
MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");

static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");

static uint hfi1_hdrq_entsize = 32;
module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B");

unsigned int user_credit_return_threshold = 33; /* default is 33% */
module_param(user_credit_return_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits pass this many blocks (in percent of allocated blocks, 0 is off)");

static inline u64 encode_rcv_header_entry_size(u16);

static struct idr hfi1_unit_table;
u32 hfi1_cpulist_count;
unsigned long *hfi1_cpulist;
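
/*
 * Illustrative usage (not part of the driver): these parameters are
 * given at module load time.  A hypothetical invocation such as
 *
 *   modprobe hfi1 krcvqs=2,2 rcvhdrcnt=4096 eager_buffer_size=0x400000
 *
 * would request two kernel receive queues each for the first two VLs,
 * a 4096-entry receive header queue, and 4MB of eager buffer space
 * per context.  The values shown are examples only; see the
 * MODULE_PARM_DESC strings above for the accepted ranges.
 */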

/*
 * Common code for creating the receive context array.
 */
int hfi1_create_ctxts(struct hfi1_devdata *dd)
{
        unsigned i;
        int ret;
        int local_node_id = pcibus_to_node(dd->pcidev->bus);

        if (local_node_id < 0)
                local_node_id = numa_node_id();
        dd->assigned_node_id = local_node_id;

        dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL);
        if (!dd->rcd)
                goto nomem;

        /* create one or more kernel contexts */
        for (i = 0; i < dd->first_user_ctxt; ++i) {
                struct hfi1_pportdata *ppd;
                struct hfi1_ctxtdata *rcd;

                ppd = dd->pport + (i % dd->num_pports);
                rcd = hfi1_create_ctxtdata(ppd, i);
                if (!rcd) {
                        dd_dev_err(dd,
                                "Unable to allocate kernel receive context, failing\n");
                        goto nomem;
                }
                /*
                 * Set up the kernel context flags here and now because they
                 * use default values for all receive side memories.  User
                 * contexts will be handled as they are created.
                 */
                rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
                        HFI1_CAP_KGET(NODROP_RHQ_FULL) |
                        HFI1_CAP_KGET(NODROP_EGR_FULL) |
                        HFI1_CAP_KGET(DMA_RTAIL);
                rcd->seq_cnt = 1;

                rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
                if (!rcd->sc) {
                        dd_dev_err(dd,
                                "Unable to allocate kernel send context, failing\n");
                        dd->rcd[rcd->ctxt] = NULL;
                        hfi1_free_ctxtdata(dd, rcd);
                        goto nomem;
                }

                ret = hfi1_init_ctxt(rcd->sc);
                if (ret < 0) {
                        dd_dev_err(dd,
                                "Failed to setup kernel receive context, failing\n");
                        sc_free(rcd->sc);
                        dd->rcd[rcd->ctxt] = NULL;
                        hfi1_free_ctxtdata(dd, rcd);
                        ret = -EFAULT;
                        goto bail;
                }
        }

        return 0;
nomem:
        ret = -ENOMEM;
bail:
        kfree(dd->rcd);
        dd->rcd = NULL;
        return ret;
}
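
/*
 * Worked example for the RcvArray base calculation in
 * hfi1_create_ctxtdata() below (the numbers are illustrative, not
 * taken from any particular hardware configuration): with
 * rcv_entries.ngroups = 4, rcv_entries.group_size = 8, and two kernel
 * contexts that each receive one extra group (kctxt_ngroups = 2),
 * kernel context 1 starts at base = 1 * (4 + 1) = 5 groups, i.e.
 * eager_base = 5 * 8 = 40 RcvArray entries.
 */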

/*
 * Common code for user and kernel context setup.
 */
struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt)
{
        struct hfi1_devdata *dd = ppd->dd;
        struct hfi1_ctxtdata *rcd;
        unsigned kctxt_ngroups = 0;
        u32 base;

        if (dd->rcv_entries.nctxt_extra >
            dd->num_rcv_contexts - dd->first_user_ctxt)
                kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
                                 (dd->num_rcv_contexts - dd->first_user_ctxt));
        rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
        if (rcd) {
                u32 rcvtids, max_entries;

                dd_dev_info(dd, "%s: setting up context %u\n", __func__, ctxt);

                INIT_LIST_HEAD(&rcd->qp_wait_list);
                rcd->ppd = ppd;
                rcd->dd = dd;
                rcd->cnt = 1;
                rcd->ctxt = ctxt;
                dd->rcd[ctxt] = rcd;
                rcd->numa_id = numa_node_id();
                rcd->rcv_array_groups = dd->rcv_entries.ngroups;

                spin_lock_init(&rcd->exp_lock);

                /*
                 * Calculate the context's RcvArray entry starting point.
                 * We do this here because we have to take into account all
                 * the RcvArray entries that previous contexts have taken,
                 * and we have to account for any extra groups assigned to
                 * the kernel or user contexts.
                 */
                if (ctxt < dd->first_user_ctxt) {
                        if (ctxt < kctxt_ngroups) {
                                base = ctxt * (dd->rcv_entries.ngroups + 1);
                                rcd->rcv_array_groups++;
                        } else
                                base = kctxt_ngroups +
                                        (ctxt * dd->rcv_entries.ngroups);
                } else {
                        u16 ct = ctxt - dd->first_user_ctxt;

                        base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
                                kctxt_ngroups);
                        if (ct < dd->rcv_entries.nctxt_extra) {
                                base += ct * (dd->rcv_entries.ngroups + 1);
                                rcd->rcv_array_groups++;
                        } else
                                base += dd->rcv_entries.nctxt_extra +
                                        (ct * dd->rcv_entries.ngroups);
                }
                rcd->eager_base = base * dd->rcv_entries.group_size;

                /* Validate and initialize Rcv Hdr Q variables */
                if (rcvhdrcnt % HDRQ_INCREMENT) {
                        dd_dev_err(dd,
                                "ctxt%u: header queue count %d must be divisible by %d\n",
                                rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
                        goto bail;
                }
                rcd->rcvhdrq_cnt = rcvhdrcnt;
                rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
                /*
                 * Simple Eager buffer allocation: we have already
                 * pre-allocated the number of RcvArray entry groups.
                 * Each ctxtdata structure holds the number of groups
                 * for that context.
                 *
                 * To follow CSR requirements and maintain cacheline
                 * alignment, make sure all sizes and bases are multiples
                 * of group_size.
                 *
                 * The expected entry count is what is left after assigning
                 * eager.
                 */
                max_entries = rcd->rcv_array_groups *
                        dd->rcv_entries.group_size;
                rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
                rcd->egrbufs.count = round_down(rcvtids,
                                                dd->rcv_entries.group_size);
                if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
                        dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
                                   rcd->ctxt);
                        rcd->egrbufs.count = MAX_EAGER_ENTRIES;
                }
                dd_dev_info(dd, "ctxt%u: max Eager buffer RcvArray entries: %u\n",
                            rcd->ctxt, rcd->egrbufs.count);

                /*
                 * Allocate array that will hold the eager buffer accounting
                 * data.
                 * This will allocate the maximum possible buffer count based
                 * on the value of the RcvArray split parameter.
                 * The resulting value will be rounded down to the closest
                 * multiple of dd->rcv_entries.group_size.
                 */
                rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count,
                                               sizeof(*rcd->egrbufs.buffers),
                                               GFP_KERNEL);
                if (!rcd->egrbufs.buffers)
                        goto bail;
                rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count,
                                               sizeof(*rcd->egrbufs.rcvtids),
                                               GFP_KERNEL);
                if (!rcd->egrbufs.rcvtids)
                        goto bail;
                rcd->egrbufs.size = eager_buffer_size;
                /*
                 * The size of the buffers programmed into the RcvArray
                 * entries needs to be big enough to handle the highest
                 * MTU supported.
                 */
                if (rcd->egrbufs.size < hfi1_max_mtu) {
                        rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
                        dd_dev_info(dd,
                                "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
                                rcd->ctxt, rcd->egrbufs.size);
                }
                rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;

                if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
                        rcd->opstats = kzalloc(sizeof(*rcd->opstats),
                                               GFP_KERNEL);
                        if (!rcd->opstats)
                                goto bail;
                }
        }
        return rcd;
bail:
        kfree(rcd->opstats);
        kfree(rcd->egrbufs.rcvtids);
        kfree(rcd->egrbufs.buffers);
        kfree(rcd);
        return NULL;
}

/*
 * Convert a receive header entry size into the encoding used in the CSR.
 *
 * Return zero if the given size is invalid.
 */
static inline u64 encode_rcv_header_entry_size(u16 size)
{
        /* there are only 3 valid receive header entry sizes */
        if (size == 2)
                return 1;
        else if (size == 16)
                return 2;
        else if (size == 32)
                return 4;
        return 0; /* invalid */
}
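
/*
 * For reference (derived from the function above): entry sizes are in
 * DWs, so the three valid sizes correspond to 8 byte (2 DWs, CSR
 * encoding 1), 64 byte (16 DWs, encoding 2) and 128 byte (32 DWs,
 * encoding 4) header queue entries, matching the hdrq_entsize module
 * parameter description.
 */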

/*
 * Select the largest ccti value over all SLs to determine the intra-
 * packet gap for the link.
 *
 * called with cca_timer_lock held (to protect access to cca_timer
 * array), and rcu_read_lock() (to protect access to cc_state).
 */
void set_link_ipg(struct hfi1_pportdata *ppd)
{
        struct hfi1_devdata *dd = ppd->dd;
        struct cc_state *cc_state;
        int i;
        u16 cce, ccti_limit, max_ccti = 0;
        u16 shift, mult;
        u64 src;
        u32 current_egress_rate; /* Mbits/sec */
        u32 max_pkt_time;
        /*
         * max_pkt_time is the maximum packet egress time in units
         * of the fabric clock period 1/(805 MHz).
         */

        cc_state = get_cc_state(ppd);

        if (cc_state == NULL)
                /*
                 * This should _never_ happen - rcu_read_lock() is held,
                 * and set_link_ipg() should not be called if cc_state
                 * is NULL.
                 */
                return;

        for (i = 0; i < OPA_MAX_SLS; i++) {
                u16 ccti = ppd->cca_timer[i].ccti;

                if (ccti > max_ccti)
                        max_ccti = ccti;
        }

        ccti_limit = cc_state->cct.ccti_limit;
        if (max_ccti > ccti_limit)
                max_ccti = ccti_limit;

        cce = cc_state->cct.entries[max_ccti].entry;
        shift = (cce & 0xc000) >> 14;
        mult = (cce & 0x3fff);

        current_egress_rate = active_egress_rate(ppd);

        max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);

        src = (max_pkt_time >> shift) * mult;

        src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
        src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;

        write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
}

static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
{
        struct cca_timer *cca_timer;
        struct hfi1_pportdata *ppd;
        int sl;
        u16 ccti, ccti_timer, ccti_min;
        struct cc_state *cc_state;
        unsigned long flags;

        cca_timer = container_of(t, struct cca_timer, hrtimer);
        ppd = cca_timer->ppd;
        sl = cca_timer->sl;

        rcu_read_lock();

        cc_state = get_cc_state(ppd);

        if (cc_state == NULL) {
                rcu_read_unlock();
                return HRTIMER_NORESTART;
        }

        /*
         * 1) decrement ccti for SL
         * 2) calculate IPG for link (set_link_ipg())
         * 3) restart timer, unless ccti is at min value
         */

        ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
        ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;

        spin_lock_irqsave(&ppd->cca_timer_lock, flags);

        ccti = cca_timer->ccti;

        if (ccti > ccti_min) {
                cca_timer->ccti--;
                set_link_ipg(ppd);
        }

        spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);

        rcu_read_unlock();

        if (ccti > ccti_min) {
                unsigned long nsec = 1024 * ccti_timer;
                /* ccti_timer is in units of 1.024 usec */
                hrtimer_forward_now(t, ns_to_ktime(nsec));
                return HRTIMER_RESTART;
        }
        return HRTIMER_NORESTART;
}
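
/*
 * Timer arithmetic illustration for cca_timer_fn() above: ccti_timer
 * is in units of 1.024 usec, so e.g. a ccti_timer value of 100 re-arms
 * the hrtimer 1024 * 100 = 102400 ns (102.4 usec) in the future.
 */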

/*
 * Common code for initializing the physical port structure.
 */
void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
                         struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
{
        int i, size;
        uint default_pkey_idx;

        ppd->dd = dd;
        ppd->hw_pidx = hw_pidx;
        ppd->port = port; /* IB port number, not index */

        default_pkey_idx = 1;

        ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
        if (loopback) {
                hfi1_early_err(&pdev->dev,
                               "Faking data partition 0x8001 in idx %u\n",
                               !default_pkey_idx);
                ppd->pkeys[!default_pkey_idx] = 0x8001;
        }

        INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
        INIT_WORK(&ppd->link_up_work, handle_link_up);
        INIT_WORK(&ppd->link_down_work, handle_link_down);
        INIT_WORK(&ppd->freeze_work, handle_freeze);
        INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
        INIT_WORK(&ppd->sma_message_work, handle_sma_message);
        INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
        mutex_init(&ppd->hls_lock);
        spin_lock_init(&ppd->sdma_alllock);
        spin_lock_init(&ppd->qsfp_info.qsfp_lock);

        ppd->sm_trap_qp = 0x0;
        ppd->sa_qp = 0x1;

        ppd->hfi1_wq = NULL;

        spin_lock_init(&ppd->cca_timer_lock);

        for (i = 0; i < OPA_MAX_SLS; i++) {
                hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
                             HRTIMER_MODE_REL);
                ppd->cca_timer[i].ppd = ppd;
                ppd->cca_timer[i].sl = i;
                ppd->cca_timer[i].ccti = 0;
                ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
        }

        ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;

        spin_lock_init(&ppd->cc_state_lock);
        spin_lock_init(&ppd->cc_log_lock);
        size = sizeof(struct cc_state);
        RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
        if (!rcu_dereference(ppd->cc_state))
                goto bail;
        return;

bail:
        hfi1_early_err(&pdev->dev,
                       "Congestion Control Agent disabled for port %d\n", port);
}

/*
 * Do initialization for device that is only needed on
 * first detect, not on resets.
 */
static int loadtime_init(struct hfi1_devdata *dd)
{
        return 0;
}

/**
 * init_after_reset - re-initialize after a reset
 * @dd: the hfi1_ib device
 *
 * Sanity check at least some of the values after reset, and
 * ensure no receive or transmit (explicitly, in case reset
 * failed).
 */
static int init_after_reset(struct hfi1_devdata *dd)
{
        int i;

        /*
         * Ensure chip does no sends or receives, tail updates, or
         * pioavail updates while we re-initialize.  This is mostly
         * for the driver data structures, not chip registers.
         */
        for (i = 0; i < dd->num_rcv_contexts; i++)
                hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
                             HFI1_RCVCTRL_INTRAVAIL_DIS |
                             HFI1_RCVCTRL_TAILUPD_DIS, i);
        pio_send_control(dd, PSC_GLOBAL_DISABLE);
        for (i = 0; i < dd->num_send_contexts; i++)
                sc_disable(dd->send_contexts[i].sc);

        return 0;
}

static void enable_chip(struct hfi1_devdata *dd)
{
        u32 rcvmask;
        u32 i;

        /* enable PIO send */
        pio_send_control(dd, PSC_GLOBAL_ENABLE);

        /*
         * Enable kernel ctxts' receive and receive interrupt.
         * Other ctxts done as user opens and initializes them.
         */
        rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
        for (i = 0; i < dd->first_user_ctxt; ++i) {
                rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
                        HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
                if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
                        rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
                if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL))
                        rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
                if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
                        rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
                hfi1_rcvctrl(dd, rcvmask, i);
                sc_enable(dd->rcd[i]->sc);
        }
}

/**
 * create_workqueues - create per port workqueues
 * @dd: the hfi1_ib device
 */
static int create_workqueues(struct hfi1_devdata *dd)
{
        int pidx;
        struct hfi1_pportdata *ppd;

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (!ppd->hfi1_wq) {
                        char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */

                        snprintf(wq_name, sizeof(wq_name), "hfi%d_%d",
                                 dd->unit, pidx);
                        ppd->hfi1_wq =
                                create_singlethread_workqueue(wq_name);
                        if (!ppd->hfi1_wq)
                                goto wq_error;
                }
        }
        return 0;
wq_error:
        pr_err("create_singlethread_workqueue failed for port %d\n",
               pidx + 1);
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (ppd->hfi1_wq) {
                        destroy_workqueue(ppd->hfi1_wq);
                        ppd->hfi1_wq = NULL;
                }
        }
        return -ENOMEM;
}
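
/*
 * Note on the wq_name sizing in create_workqueues() above: the
 * "3 + 2 + 1 + 1 + 1" comment breaks down as "hfi" (3 chars), up to
 * two digits of unit number, '_' (1), one digit of port index, and
 * the terminating NUL - 8 bytes total, hence char wq_name[8].
 */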

/**
 * hfi1_init - do the actual initialization sequence on the chip
 * @dd: the hfi1_ib device
 * @reinit: re-initializing, so don't allocate new memory
 *
 * Do the actual initialization sequence on the chip.  This is done
 * both from the init routine called from the PCI infrastructure, and
 * when we reset the chip, or detect that it was reset internally,
 * or it's administratively re-enabled.
 *
 * Memory allocation here and in called routines is only done in
 * the first case (reinit == 0).  We have to be careful, because even
 * without memory allocation, we need to re-write all the chip registers,
 * TIDs, etc. after the reset or enable has completed.
 */
int hfi1_init(struct hfi1_devdata *dd, int reinit)
{
        int ret = 0, pidx, lastfail = 0;
        unsigned i, len;
        struct hfi1_ctxtdata *rcd;
        struct hfi1_pportdata *ppd;

        /* Set up recv low level handlers */
        dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
                kdeth_process_expected;
        dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
                kdeth_process_eager;
        dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
        dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
                process_receive_error;
        dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
                process_receive_bypass;
        dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
                process_receive_invalid;
        dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
                process_receive_invalid;
        dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
                process_receive_invalid;
        dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;

        /* Set up send low level handlers */
        dd->process_pio_send = hfi1_verbs_send_pio;
        dd->process_dma_send = hfi1_verbs_send_dma;
        dd->pio_inline_send = pio_copy;

        if (is_a0(dd)) {
                atomic_set(&dd->drop_packet, DROP_PACKET_ON);
                dd->do_drop = 1;
        } else {
                atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
                dd->do_drop = 0;
        }

        /* make sure the link is not "up" */
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                ppd->linkup = 0;
        }

        if (reinit)
                ret = init_after_reset(dd);
        else
                ret = loadtime_init(dd);
        if (ret)
                goto done;

        /* dd->rcd can be NULL if early initialization failed */
        for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
                /*
                 * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
                 * re-init, the simplest way to handle this is to free
                 * existing, and re-allocate.
                 * Need to re-create rest of ctxt 0 ctxtdata as well.
                 */
                rcd = dd->rcd[i];
                if (!rcd)
                        continue;

                rcd->do_interrupt = &handle_receive_interrupt;

                lastfail = hfi1_create_rcvhdrq(dd, rcd);
                if (!lastfail)
                        lastfail = hfi1_setup_eagerbufs(rcd);
                if (lastfail)
                        dd_dev_err(dd,
                                "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
        }
        if (lastfail)
                ret = lastfail;

        /* Allocate enough memory for user event notification. */
        len = ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
                    sizeof(*dd->events), PAGE_SIZE);
        dd->events = vmalloc_user(len);
        if (!dd->events)
                dd_dev_err(dd, "Failed to allocate user events page\n");
        /*
         * Allocate a page for device and port status.
         * Page will be shared amongst all user processes.
         */
        dd->status = vmalloc_user(PAGE_SIZE);
        if (!dd->status)
                dd_dev_err(dd, "Failed to allocate dev status page\n");
        else
                dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
                                             sizeof(dd->status->freezemsg));
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (dd->status)
                        /* Currently, we only have one port */
                        ppd->statusp = &dd->status->port;

                set_mtu(ppd);
        }

        /* enable chip even if we have an error, so we can debug cause */
        enable_chip(dd);

        ret = hfi1_cq_init(dd);
done:
        /*
         * Set status even if port serdes is not initialized
         * so that diags will work.
         */
        if (dd->status)
                dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
                        HFI1_STATUS_INITTED;
        if (!ret) {
                /* enable all interrupts from the chip */
                set_intr_state(dd, 1);

                /* chip is OK for user apps; mark it as initialized */
                for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                        ppd = dd->pport + pidx;

                        /*
                         * Initialize the qsfp if it exists.  This requires
                         * interrupts to be enabled so we are notified when
                         * the QSFP completes reset, and it has to be done
                         * before bringing up the SERDES.
                         */
                        init_qsfp(ppd);

                        /*
                         * Start the serdes - must be after interrupts are
                         * enabled so we are notified when the link goes up.
                         */
                        lastfail = bringup_serdes(ppd);
                        if (lastfail)
                                dd_dev_info(dd,
                                        "Failed to bring up port %u\n",
                                        ppd->port);

                        /*
                         * Set status even if port serdes is not initialized
                         * so that diags will work.
                         */
                        if (ppd->statusp)
                                *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
                                        HFI1_STATUS_INITTED;
                        if (!ppd->link_speed_enabled)
                                continue;
                }
        }

        /* if ret is non-zero, we probably should do some cleanup here... */
        return ret;
}

static inline struct hfi1_devdata *__hfi1_lookup(int unit)
{
        return idr_find(&hfi1_unit_table, unit);
}

struct hfi1_devdata *hfi1_lookup(int unit)
{
        struct hfi1_devdata *dd;
        unsigned long flags;

        spin_lock_irqsave(&hfi1_devs_lock, flags);
        dd = __hfi1_lookup(unit);
        spin_unlock_irqrestore(&hfi1_devs_lock, flags);

        return dd;
}

/*
 * Stop the timers during unit shutdown, or after an error late
 * in initialization.
 */
static void stop_timers(struct hfi1_devdata *dd)
{
        struct hfi1_pportdata *ppd;
        int pidx;

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (ppd->led_override_timer.data) {
                        del_timer_sync(&ppd->led_override_timer);
                        atomic_set(&ppd->led_override_timer_active, 0);
                }
        }
}

/**
 * shutdown_device - shut down a device
 * @dd: the hfi1_ib device
 *
 * This is called to make the device quiet when we are about to
 * unload the driver, and also when the device is administratively
 * disabled.  It does not free any data structures.
 * Everything it does has to be setup again by hfi1_init(dd, 1)
 */
static void shutdown_device(struct hfi1_devdata *dd)
{
        struct hfi1_pportdata *ppd;
        unsigned pidx;
        int i;

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;

                ppd->linkup = 0;
                if (ppd->statusp)
                        *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
                                           HFI1_STATUS_IB_READY);
        }
        dd->flags &= ~HFI1_INITTED;

        /* mask interrupts, but not errors */
        set_intr_state(dd, 0);

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                for (i = 0; i < dd->num_rcv_contexts; i++)
                        hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
                                     HFI1_RCVCTRL_CTXT_DIS |
                                     HFI1_RCVCTRL_INTRAVAIL_DIS |
                                     HFI1_RCVCTRL_PKEY_DIS |
                                     HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
                /*
                 * Gracefully stop all sends allowing any in progress to
                 * trickle out first.
                 */
                for (i = 0; i < dd->num_send_contexts; i++)
                        sc_flush(dd->send_contexts[i].sc);
        }

        /*
         * Enough for anything that's going to trickle out to have actually
         * done so.
         */
        udelay(20);

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;

                /* disable all contexts */
                for (i = 0; i < dd->num_send_contexts; i++)
                        sc_disable(dd->send_contexts[i].sc);
                /* disable the send device */
                pio_send_control(dd, PSC_GLOBAL_DISABLE);

                /*
                 * Clear SerdesEnable.
                 * We can't count on interrupts since we are stopping.
                 */
                hfi1_quiet_serdes(ppd);

                if (ppd->hfi1_wq) {
                        destroy_workqueue(ppd->hfi1_wq);
                        ppd->hfi1_wq = NULL;
                }
        }
        sdma_exit(dd);
}

/**
 * hfi1_free_ctxtdata - free a context's allocated data
 * @dd: the hfi1_ib device
 * @rcd: the ctxtdata structure
 *
 * free up any allocated data for a context
 * This should not touch anything that would affect a simultaneous
 * re-allocation of context data, because it is called after hfi1_mutex
 * is released (and can be called from reinit as well).
 * It should never change any chip state, or global driver state.
 */
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
        unsigned e;

        if (!rcd)
                return;

        if (rcd->rcvhdrq) {
                dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
                                  rcd->rcvhdrq, rcd->rcvhdrq_phys);
                rcd->rcvhdrq = NULL;
                if (rcd->rcvhdrtail_kvaddr) {
                        dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
                                          (void *)rcd->rcvhdrtail_kvaddr,
                                          rcd->rcvhdrqtailaddr_phys);
                        rcd->rcvhdrtail_kvaddr = NULL;
                }
        }

        /* all the RcvArray entries should have been cleared by now */
        kfree(rcd->egrbufs.rcvtids);

        for (e = 0; e < rcd->egrbufs.alloced; e++) {
                if (rcd->egrbufs.buffers[e].phys)
                        dma_free_coherent(&dd->pcidev->dev,
                                          rcd->egrbufs.buffers[e].len,
                                          rcd->egrbufs.buffers[e].addr,
                                          rcd->egrbufs.buffers[e].phys);
        }
        kfree(rcd->egrbufs.buffers);

        sc_free(rcd->sc);
        vfree(rcd->physshadow);
        vfree(rcd->tid_pg_list);
        vfree(rcd->user_event_mask);
        vfree(rcd->subctxt_uregbase);
        vfree(rcd->subctxt_rcvegrbuf);
        vfree(rcd->subctxt_rcvhdr_base);
        kfree(rcd->tidusemap);
        kfree(rcd->opstats);
        kfree(rcd);
}

void hfi1_free_devdata(struct hfi1_devdata *dd)
{
        unsigned long flags;

        spin_lock_irqsave(&hfi1_devs_lock, flags);
        idr_remove(&hfi1_unit_table, dd->unit);
        list_del(&dd->list);
        spin_unlock_irqrestore(&hfi1_devs_lock, flags);
        hfi1_dbg_ibdev_exit(&dd->verbs_dev);
        rcu_barrier(); /* wait for rcu callbacks to complete */
        free_percpu(dd->int_counter);
        free_percpu(dd->rcv_limit);
        ib_dealloc_device(&dd->verbs_dev.ibdev);
}

/*
 * Allocate our primary per-unit data structure.  Must be done via verbs
 * allocator, because the verbs cleanup process both does cleanup and
 * free of the data structure.
 * "extra" is for chip-specific data.
 *
 * Use the idr mechanism to get a unit number for this unit.
 */
struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
{
        unsigned long flags;
        struct hfi1_devdata *dd;
        int ret;

        dd = (struct hfi1_devdata *)ib_alloc_device(sizeof(*dd) + extra);
        if (!dd)
                return ERR_PTR(-ENOMEM);
        /* extra is sizeof(struct hfi1_pportdata) * number of ports */
        dd->num_pports = extra / sizeof(struct hfi1_pportdata);
        dd->pport = (struct hfi1_pportdata *)(dd + 1);

        INIT_LIST_HEAD(&dd->list);
        dd->node = dev_to_node(&pdev->dev);
        if (dd->node < 0)
                dd->node = 0;
        idr_preload(GFP_KERNEL);
        spin_lock_irqsave(&hfi1_devs_lock, flags);

        ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
        if (ret >= 0) {
                dd->unit = ret;
                list_add(&dd->list, &hfi1_dev_list);
        }

        spin_unlock_irqrestore(&hfi1_devs_lock, flags);
        idr_preload_end();

        if (ret < 0) {
                hfi1_early_err(&pdev->dev,
                               "Could not allocate unit ID: error %d\n", -ret);
                goto bail;
        }
        /*
         * Initialize all locks for the device.  This needs to be as early as
         * possible so locks are usable.
         */
        spin_lock_init(&dd->sc_lock);
        spin_lock_init(&dd->sendctrl_lock);
        spin_lock_init(&dd->rcvctrl_lock);
        spin_lock_init(&dd->uctxt_lock);
        spin_lock_init(&dd->hfi1_diag_trans_lock);
        spin_lock_init(&dd->sc_init_lock);
        spin_lock_init(&dd->dc8051_lock);
        spin_lock_init(&dd->dc8051_memlock);
        mutex_init(&dd->qsfp_i2c_mutex);
        seqlock_init(&dd->sc2vl_lock);
        spin_lock_init(&dd->sde_map_lock);
        init_waitqueue_head(&dd->event_queue);

        dd->int_counter = alloc_percpu(u64);
        if (!dd->int_counter) {
                ret = -ENOMEM;
                hfi1_early_err(&pdev->dev,
                               "Could not allocate per-cpu int_counter\n");
                goto bail;
        }

        dd->rcv_limit = alloc_percpu(u64);
        if (!dd->rcv_limit) {
                ret = -ENOMEM;
                hfi1_early_err(&pdev->dev,
                               "Could not allocate per-cpu rcv_limit\n");
                goto bail;
        }

        if (!hfi1_cpulist_count) {
                u32 count = num_online_cpus();

                hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
                                       GFP_KERNEL);
                if (hfi1_cpulist)
                        hfi1_cpulist_count = count;
                else
                        hfi1_early_err(
                                &pdev->dev,
                                "Could not alloc cpulist info, cpu affinity might be wrong\n");
        }
        hfi1_dbg_ibdev_init(&dd->verbs_dev);
        return dd;

bail:
        if (!list_empty(&dd->list))
                list_del_init(&dd->list);
        ib_dealloc_device(&dd->verbs_dev.ibdev);
        return ERR_PTR(ret);
}
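
/*
 * Illustrative call pattern for hfi1_alloc_devdata() (the actual
 * caller is the chip-specific init code; the port count here is
 * hypothetical):
 *
 *   dd = hfi1_alloc_devdata(pdev,
 *                           nports * sizeof(struct hfi1_pportdata));
 *
 * The per-port structures are carved out of the "extra" space
 * immediately following the devdata, which is why dd->pport is set
 * to (struct hfi1_pportdata *)(dd + 1) above.
 */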

/*
 * Called from freeze mode handlers, and from PCI error
 * reporting code.  Should be paranoid about state of
 * system and data structures.
 */
void hfi1_disable_after_error(struct hfi1_devdata *dd)
{
        if (dd->flags & HFI1_INITTED) {
                u32 pidx;

                dd->flags &= ~HFI1_INITTED;
                if (dd->pport)
                        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                                struct hfi1_pportdata *ppd;

                                ppd = dd->pport + pidx;
                                if (dd->flags & HFI1_PRESENT)
                                        set_link_state(ppd, HLS_DN_DISABLE);

                                if (ppd->statusp)
                                        *ppd->statusp &= ~HFI1_STATUS_IB_READY;
                        }
        }

        /*
         * Mark as having had an error for driver, and also
         * for /sys and status word mapped to user programs.
         * This marks unit as not usable, until reset.
         */
        if (dd->status)
                dd->status->dev |= HFI1_STATUS_HWERROR;
}

static void remove_one(struct pci_dev *);
static int init_one(struct pci_dev *, const struct pci_device_id *);

#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
#define PFX DRIVER_NAME ": "

static const struct pci_device_id hfi1_pci_tbl[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
        { 0, }
};

MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);

static struct pci_driver hfi1_pci_driver = {
        .name = DRIVER_NAME,
        .probe = init_one,
        .remove = remove_one,
        .id_table = hfi1_pci_tbl,
        .err_handler = &hfi1_pci_err_handler,
};

static void __init compute_krcvqs(void)
{
        int i;

        for (i = 0; i < krcvqsset; i++)
                n_krcvqs += krcvqs[i];
}

/*
 * Do all the generic driver unit- and chip-independent memory
 * allocation and initialization.
 */
static int __init hfi1_mod_init(void)
{
        int ret;

        ret = dev_init();
        if (ret)
                goto bail;

        /* validate max MTU before any devices start */
        if (!valid_opa_max_mtu(hfi1_max_mtu)) {
                pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
                       hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
                hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
        }
        /* valid CUs run from 1-128 in powers of 2 */
        if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
                hfi1_cu = 1;
        /* valid credit return threshold is 0-100, variable is unsigned */
        if (user_credit_return_threshold > 100)
                user_credit_return_threshold = 100;

        compute_krcvqs();
        /*
         * sanitize receive interrupt count, time must wait until after
         * the hardware type is known
         */
        if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
                rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
        /* reject invalid combinations */
        if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
                pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
                rcv_intr_count = 1;
        }
        if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
                /*
                 * Avoid indefinite packet delivery by requiring a timeout
                 * if count is > 1.
                 */
                pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
                rcv_intr_timeout = 1;
        }
        if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
                /*
                 * The dynamic algorithm expects a non-zero timeout
                 * and a count > 1.
                 */
                pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
                rcv_intr_dynamic = 0;
        }

        /* sanitize link CRC options */
        link_crc_mask &= SUPPORTED_CRCS;

        /*
         * These must be called before the driver is registered with
         * the PCI subsystem.
         */
        idr_init(&hfi1_unit_table);

        hfi1_dbg_init();
        ret = pci_register_driver(&hfi1_pci_driver);
        if (ret < 0) {
                pr_err("Unable to register driver: error %d\n", -ret);
                goto bail_dev;
        }
        goto bail; /* all OK */

bail_dev:
        hfi1_dbg_exit();
        idr_destroy(&hfi1_unit_table);
        dev_cleanup();
bail:
        return ret;
}

module_init(hfi1_mod_init);

/*
 * Do the non-unit driver cleanup, memory free, etc. at unload.
 */
static void __exit hfi1_mod_cleanup(void)
{
        pci_unregister_driver(&hfi1_pci_driver);
        hfi1_dbg_exit();
        hfi1_cpulist_count = 0;
        kfree(hfi1_cpulist);

        idr_destroy(&hfi1_unit_table);
        dispose_firmware(); /* asymmetric with obtain_firmware() */
        dev_cleanup();
}

module_exit(hfi1_mod_cleanup);

/* this can only be called after a successful initialization */
static void cleanup_device_data(struct hfi1_devdata *dd)
{
        int ctxt;
        int pidx;
        struct hfi1_ctxtdata **tmp;
        unsigned long flags;

        /* users can't do anything more with chip */
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                struct hfi1_pportdata *ppd = &dd->pport[pidx];
                struct cc_state *cc_state;
                int i;

                if (ppd->statusp)
                        *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;

                for (i = 0; i < OPA_MAX_SLS; i++)
                        hrtimer_cancel(&ppd->cca_timer[i].hrtimer);

                spin_lock(&ppd->cc_state_lock);
                cc_state = get_cc_state(ppd);
                rcu_assign_pointer(ppd->cc_state, NULL);
                spin_unlock(&ppd->cc_state_lock);

                if (cc_state)
                        call_rcu(&cc_state->rcu, cc_state_reclaim);
        }

        free_credit_return(dd);

        /*
         * Free any resources still in use (usually just kernel contexts)
         * at unload; we loop over the full context count, because that's
         * what we allocate.  We acquire the lock to be really paranoid
         * that rcd isn't being accessed from some interrupt-related code
         * (that should not happen, but best to be sure).
         */
        spin_lock_irqsave(&dd->uctxt_lock, flags);
        tmp = dd->rcd;
        dd->rcd = NULL;
        spin_unlock_irqrestore(&dd->uctxt_lock, flags);
        for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
                struct hfi1_ctxtdata *rcd = tmp[ctxt];

                tmp[ctxt] = NULL; /* debugging paranoia */
                if (rcd) {
                        hfi1_clear_tids(rcd);
                        hfi1_free_ctxtdata(dd, rcd);
                }
        }
        kfree(tmp);
        /* must follow rcv context free - need to remove rcv's hooks */
        for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
                sc_free(dd->send_contexts[ctxt].sc);
        dd->num_send_contexts = 0;
        kfree(dd->send_contexts);
        dd->send_contexts = NULL;
        kfree(dd->boardname);
        vfree(dd->events);
        vfree(dd->status);
        hfi1_cq_exit(dd);
}

/*
 * Clean up on unit shutdown, or error during unit load after
 * successful initialization.
 */
static void postinit_cleanup(struct hfi1_devdata *dd)
{
        hfi1_start_cleanup(dd);

        hfi1_pcie_ddcleanup(dd);
        hfi1_pcie_cleanup(dd->pcidev);

        cleanup_device_data(dd);

        hfi1_free_devdata(dd);
}

static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
        int ret = 0, j, pidx, initfail;
        struct hfi1_devdata *dd = NULL;

        /* First, lock the non-writable module parameters */
        HFI1_CAP_LOCK();

        /* Validate some global module parameters */
        if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
                hfi1_early_err(&pdev->dev, "Header queue count too small\n");
                ret = -EINVAL;
                goto bail;
        }
        /* use the encoding function as a sanitization check */
        if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
                hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
                               hfi1_hdrq_entsize);
                ret = -EINVAL;
                goto bail;
        }

        /*
         * The receive eager buffer size must be set before the receive
         * contexts are created.
         *
         * Set the eager buffer size.
         * Validate that it falls in a range
         * allowed by the hardware - all powers of 2 between the min and
         * max.  The maximum valid MTU is within the eager buffer range
         * so we do not need to cap the max_mtu by an eager buffer size
         * setting.
         */
        if (eager_buffer_size) {
                if (!is_power_of_2(eager_buffer_size))
                        eager_buffer_size =
                                roundup_pow_of_two(eager_buffer_size);
                eager_buffer_size =
                        clamp_val(eager_buffer_size,
                                  MIN_EAGER_BUFFER * 8,
                                  MAX_EAGER_BUFFER_TOTAL);
                hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
                                eager_buffer_size);
        } else {
                hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
                ret = -EINVAL;
                goto bail;
        }

        /* restrict value of hfi1_rcvarr_split */
        hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);

        ret = hfi1_pcie_init(pdev, ent);
        if (ret)
                goto bail;

        /*
         * Do device-specific initialization, function table setup, dd
         * allocation, etc.
         */
        switch (ent->device) {
        case PCI_DEVICE_ID_INTEL0:
        case PCI_DEVICE_ID_INTEL1:
                dd = hfi1_init_dd(pdev, ent);
                break;
        default:
                hfi1_early_err(&pdev->dev,
                               "Failing on unknown Intel deviceid 0x%x\n",
                               ent->device);
                ret = -ENODEV;
        }

        if (IS_ERR(dd))
                ret = PTR_ERR(dd);
        if (ret)
                goto clean_bail; /* error already printed */

        ret = create_workqueues(dd);
        if (ret)
                goto clean_bail;

        /* do the generic initialization */
        initfail = hfi1_init(dd, 0);

        ret = hfi1_register_ib_device(dd);

        /*
         * Now ready for use.  This should be cleared whenever we
         * detect a reset, or initiate one.  If earlier failure,
         * we still create devices, so diags, etc. can be used
         * to determine cause of problem.
         */
        if (!initfail && !ret)
                dd->flags |= HFI1_INITTED;

        j = hfi1_device_create(dd);
        if (j)
                dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);

        if (initfail || ret) {
                stop_timers(dd);
                flush_workqueue(ib_wq);
                for (pidx = 0; pidx < dd->num_pports; ++pidx)
                        hfi1_quiet_serdes(dd->pport + pidx);
                if (!j)
                        hfi1_device_remove(dd);
                if (!ret)
                        hfi1_unregister_ib_device(dd);
                postinit_cleanup(dd);
                if (initfail)
                        ret = initfail;
                goto bail; /* everything already cleaned */
        }

        sdma_start(dd);

        return 0;

clean_bail:
        hfi1_pcie_cleanup(pdev);
bail:
        return ret;
}

static void remove_one(struct pci_dev *pdev)
{
        struct hfi1_devdata *dd = pci_get_drvdata(pdev);

        /* unregister from IB core */
        hfi1_unregister_ib_device(dd);

        /*
         * Disable the IB link, disable interrupts on the device,
         * clear dma engines, etc.
         */
        shutdown_device(dd);

        stop_timers(dd);

        /* wait until all of our (qsfp) queue_work() calls complete */
        flush_workqueue(ib_wq);

        hfi1_device_remove(dd);

        postinit_cleanup(dd);
}
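
/*
 * Sizing illustration for hfi1_create_rcvhdrq() below: rcvhdrqentsize
 * is in DWs, so with the default rcvhdrcnt of 2048 and hdrq_entsize of
 * 32, the queue needs ALIGN(2048 * 32 * sizeof(u32), PAGE_SIZE) =
 * 256KB of DMA-coherent memory (already page aligned in this example).
 */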

/**
 * hfi1_create_rcvhdrq - create a receive header queue
 * @dd: the hfi1_ib device
 * @rcd: the context data
 *
 * This must be contiguous memory (from an i/o perspective), and must be
 * DMA'able (which means for some systems, it will go through an IOMMU,
 * or be forced into a low address range).
 */
int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
        unsigned amt;
        u64 reg;

        if (!rcd->rcvhdrq) {
                dma_addr_t phys_hdrqtail;
                gfp_t gfp_flags;

                /*
                 * rcvhdrqentsize is in DWs, so we have to convert to bytes
                 * (* sizeof(u32)).
                 */
                amt = ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
                            sizeof(u32), PAGE_SIZE);

                gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
                        GFP_USER : GFP_KERNEL;
                rcd->rcvhdrq = dma_zalloc_coherent(
                        &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
                        gfp_flags | __GFP_COMP);

                if (!rcd->rcvhdrq) {
                        dd_dev_err(dd,
                                "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
                                amt, rcd->ctxt);
                        goto bail;
                }

                /* Event mask is per device now and is in hfi1_devdata */
                /*
                 * if (rcd->ctxt >= dd->first_user_ctxt) {
                 *         rcd->user_event_mask = vmalloc_user(PAGE_SIZE);
                 *         if (!rcd->user_event_mask)
                 *                 goto bail_free_hdrq;
                 * }
                 */

                if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
                        rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
                                &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
                                gfp_flags);
                        if (!rcd->rcvhdrtail_kvaddr)
                                goto bail_free;
                        rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
                }

                rcd->rcvhdrq_size = amt;
        }
        /*
         * These values are per-context:
         *      RcvHdrCnt
         *      RcvHdrEntSize
         *      RcvHdrSize
         */
        reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
                & RCV_HDR_CNT_CNT_MASK)
                << RCV_HDR_CNT_CNT_SHIFT;
        write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
        reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
                & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
                << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
        write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
        reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
                << RCV_HDR_SIZE_HDR_SIZE_SHIFT;
        write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
        return 0;

bail_free:
        dd_dev_err(dd,
                "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
                rcd->ctxt);
        vfree(rcd->user_event_mask);
        rcd->user_event_mask = NULL;
        dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
                          rcd->rcvhdrq_phys);
        rcd->rcvhdrq = NULL;
bail:
        return -ENOMEM;
}
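
/*
 * Allocation strategy sketch for hfi1_setup_eagerbufs() below
 * (an illustrative walk-through of the code, not additional behavior):
 * if a dma_zalloc_coherent() call for the current rcvtid_size fails,
 * the size is halved and, when buffers have already been allocated at
 * the larger size, those buffers are re-partitioned into rcvtid_size
 * chunks so that every RcvArray entry still points at a buffer of the
 * new, uniform size.  The fallback stops at the MTU-derived minimum.
 */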

/**
 * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user contexts
 * @rcd: the context we are setting up.
 *
 * Allocate the eager TID buffers and program them into the chip.
 * They are no longer completely contiguous; we do multiple allocation
 * calls.  Otherwise we get the OOM code involved, by asking for too
 * much per call, with disastrous results on some kernels.
 */
int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
{
        struct hfi1_devdata *dd = rcd->dd;
        u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
        gfp_t gfp_flags;
        u16 order;
        int ret = 0;
        u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);

        /*
         * GFP_USER, but without GFP_FS, so buffer cache can be
         * coalesced (we hope); otherwise, even at order 4,
         * heavy filesystem activity makes these fail, and we can
         * use compound pages.
         */
        gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;

        /*
         * The minimum size of the eager buffers is a group of MTU-sized
         * buffers.
         * The global eager_buffer_size parameter is checked against the
         * theoretical lower limit of the value.  Here, we check against the
         * MTU.
         */
        if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
                rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
        /*
         * If using one-pkt-per-egr-buffer, lower the eager buffer
         * size to the max MTU (page-aligned).
         */
        if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
                rcd->egrbufs.rcvtid_size = round_mtu;

        /*
         * Eager buffers sizes of 1MB or less require smaller TID sizes
         * to satisfy the "multiple of 8 RcvArray entries" requirement.
         */
        if (rcd->egrbufs.size <= (1 << 20))
                rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
                        rounddown_pow_of_two(rcd->egrbufs.size / 8));

        while (alloced_bytes < rcd->egrbufs.size &&
               rcd->egrbufs.alloced < rcd->egrbufs.count) {
                rcd->egrbufs.buffers[idx].addr =
                        dma_zalloc_coherent(&dd->pcidev->dev,
                                            rcd->egrbufs.rcvtid_size,
                                            &rcd->egrbufs.buffers[idx].phys,
                                            gfp_flags);
                if (rcd->egrbufs.buffers[idx].addr) {
                        rcd->egrbufs.buffers[idx].len =
                                rcd->egrbufs.rcvtid_size;
                        rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
                                rcd->egrbufs.buffers[idx].addr;
                        rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
                                rcd->egrbufs.buffers[idx].phys;
                        rcd->egrbufs.alloced++;
                        alloced_bytes += rcd->egrbufs.rcvtid_size;
                        idx++;
                } else {
                        u32 new_size, i, j;
                        u64 offset = 0;

                        /*
                         * Fail the eager buffer allocation if:
                         * - we are already using the lowest acceptable size
                         * - we are using one-pkt-per-egr-buffer (this implies
                         *   that we are accepting only one size)
                         */
                        if (rcd->egrbufs.rcvtid_size == round_mtu ||
                            !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
                                dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
                                           rcd->ctxt);
                                goto bail_rcvegrbuf_phys;
                        }

                        new_size = rcd->egrbufs.rcvtid_size / 2;

                        /*
                         * If the first attempt to allocate memory failed, don't
                         * fail everything but continue with the next lower
                         * size.
                         */
                        if (idx == 0) {
                                rcd->egrbufs.rcvtid_size = new_size;
                                continue;
                        }

                        /*
                         * Re-partition already allocated buffers to a smaller
                         * size.
                         */
                        rcd->egrbufs.alloced = 0;
                        for (i = 0, j = 0, offset = 0; j < idx; i++) {
                                if (i >= rcd->egrbufs.count)
                                        break;
                                rcd->egrbufs.rcvtids[i].phys =
                                        rcd->egrbufs.buffers[j].phys + offset;
                                rcd->egrbufs.rcvtids[i].addr =
                                        rcd->egrbufs.buffers[j].addr + offset;
                                rcd->egrbufs.alloced++;
                                if ((rcd->egrbufs.buffers[j].phys + offset +
                                     new_size) ==
                                    (rcd->egrbufs.buffers[j].phys +
                                     rcd->egrbufs.buffers[j].len)) {
                                        j++;
                                        offset = 0;
                                } else
                                        offset += new_size;
                        }
                        rcd->egrbufs.rcvtid_size = new_size;
                }
        }
        rcd->egrbufs.numbufs = idx;
        rcd->egrbufs.size = alloced_bytes;

        dd_dev_info(dd, "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
                    rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
                    rcd->egrbufs.size);

        /*
         * Set the context's rcv array head update threshold to the closest
         * power of 2 (so we can use a mask instead of modulo) below half
         * the allocated entries.
         */
        rcd->egrbufs.threshold =
                rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
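        /*
         * Example (illustrative numbers): with 512 allocated eager
         * buffers the threshold becomes rounddown_pow_of_two(256) = 256,
         * so the head update check reduces to a power-of-2 mask test.
         */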
        /*
         * Compute the expected RcvArray entry base.  This is done after
         * allocating the eager buffers in order to maximize the
         * expected RcvArray entries for the context.
         */
        max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
        egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
        rcd->expected_count = max_entries - egrtop;
        if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
                rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;

        rcd->expected_base = rcd->eager_base + egrtop;
        dd_dev_info(dd, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
                    rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
                    rcd->eager_base, rcd->expected_base);

        if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
                dd_dev_err(dd, "ctxt%u: current Eager buffer size is invalid %u\n",
                           rcd->ctxt, rcd->egrbufs.rcvtid_size);
                ret = -EINVAL;
                goto bail;
        }

        for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
                hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
                             rcd->egrbufs.rcvtids[idx].phys, order);
                cond_resched();
        }
        goto bail;

bail_rcvegrbuf_phys:
        for (idx = 0; idx < rcd->egrbufs.alloced &&
             rcd->egrbufs.buffers[idx].addr;
             idx++) {
                dma_free_coherent(&dd->pcidev->dev,
                                  rcd->egrbufs.buffers[idx].len,
                                  rcd->egrbufs.buffers[idx].addr,
                                  rcd->egrbufs.buffers[idx].phys);
                rcd->egrbufs.buffers[idx].addr = NULL;
                rcd->egrbufs.buffers[idx].phys = 0;
                rcd->egrbufs.buffers[idx].len = 0;
        }
bail:
        return ret;
}