1/* 2 * Support PCI/PCIe on PowerNV platforms 3 * 4 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 */ 11 12#undef DEBUG 13 14#include <linux/kernel.h> 15#include <linux/pci.h> 16#include <linux/crash_dump.h> 17#include <linux/debugfs.h> 18#include <linux/delay.h> 19#include <linux/string.h> 20#include <linux/init.h> 21#include <linux/bootmem.h> 22#include <linux/irq.h> 23#include <linux/io.h> 24#include <linux/msi.h> 25#include <linux/memblock.h> 26 27#include <asm/sections.h> 28#include <asm/io.h> 29#include <asm/prom.h> 30#include <asm/pci-bridge.h> 31#include <asm/machdep.h> 32#include <asm/msi_bitmap.h> 33#include <asm/ppc-pci.h> 34#include <asm/opal.h> 35#include <asm/iommu.h> 36#include <asm/tce.h> 37#include <asm/xics.h> 38#include <asm/debug.h> 39#include <asm/firmware.h> 40#include <asm/pnv-pci.h> 41 42#include <misc/cxl.h> 43 44#include "powernv.h" 45#include "pci.h" 46 47/* 256M DMA window, 4K TCE pages, 8 bytes TCE */ 48#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) 49 50static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, 51 const char *fmt, ...) 52{ 53 struct va_format vaf; 54 va_list args; 55 char pfix[32]; 56 57 va_start(args, fmt); 58 59 vaf.fmt = fmt; 60 vaf.va = &args; 61 62 if (pe->flags & PNV_IODA_PE_DEV) 63 strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); 64 else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) 65 sprintf(pfix, "%04x:%02x ", 66 pci_domain_nr(pe->pbus), pe->pbus->number); 67#ifdef CONFIG_PCI_IOV 68 else if (pe->flags & PNV_IODA_PE_VF) 69 sprintf(pfix, "%04x:%02x:%2x.%d", 70 pci_domain_nr(pe->parent_dev->bus), 71 (pe->rid & 0xff00) >> 8, 72 PCI_SLOT(pe->rid), PCI_FUNC(pe->rid)); 73#endif /* CONFIG_PCI_IOV*/ 74 75 printk("%spci %s: [PE# %.3d] %pV", 76 level, pfix, pe->pe_number, &vaf); 77 78 va_end(args); 79} 80 81#define pe_err(pe, fmt, ...) \ 82 pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) 83#define pe_warn(pe, fmt, ...) \ 84 pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) 85#define pe_info(pe, fmt, ...) 
\
	pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)

static bool pnv_iommu_bypass_disabled __read_mostly;

static int __init iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;

	while (*str) {
		if (!strncmp(str, "nobypass", 8)) {
			pnv_iommu_bypass_disabled = true;
			pr_info("PowerNV: IOMMU bypass window disabled.\n");
			break;
		}
		str += strcspn(str, ",");
		if (*str == ',')
			str++;
	}

	return 0;
}
early_param("iommu", iommu_setup);

/*
 * stdcix is only supposed to be used in hypervisor real mode as per
 * the architecture spec
 */
static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
{
	__asm__ __volatile__("stdcix %0,0,%1"
		: : "r" (val), "r" (paddr) : "memory");
}

static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
{
	return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==
		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
}

static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
{
	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
		pr_warn("%s: Invalid PE %d on PHB#%x\n",
			__func__, pe_no, phb->hose->global_number);
		return;
	}

	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
		pr_warn("%s: PE %d was assigned on PHB#%x\n",
			__func__, pe_no, phb->hose->global_number);
		return;
	}

	phb->ioda.pe_array[pe_no].phb = phb;
	phb->ioda.pe_array[pe_no].pe_number = pe_no;
}

static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
{
	unsigned long pe;

	do {
		pe = find_next_zero_bit(phb->ioda.pe_alloc,
					phb->ioda.total_pe, 0);
		if (pe >= phb->ioda.total_pe)
			return IODA_INVALID_PE;
	} while(test_and_set_bit(pe, phb->ioda.pe_alloc));

	phb->ioda.pe_array[pe].phb = phb;
	phb->ioda.pe_array[pe].pe_number = pe;
	return pe;
}

static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
{
	WARN_ON(phb->ioda.pe_array[pe].pdev);

	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
	clear_bit(pe, phb->ioda.pe_alloc);
}

/* The default M64 BAR is shared by all PEs */
static int pnv_ioda2_init_m64(struct pnv_phb *phb)
{
	const char *desc;
	struct resource *r;
	s64 rc;

	/* Configure the default M64 BAR */
	rc = opal_pci_set_phb_mem_window(phb->opal_id,
					 OPAL_M64_WINDOW_TYPE,
					 phb->ioda.m64_bar_idx,
					 phb->ioda.m64_base,
					 0, /* unused */
					 phb->ioda.m64_size);
	if (rc != OPAL_SUCCESS) {
		desc = "configuring";
		goto fail;
	}

	/* Enable the default M64 BAR */
	rc = opal_pci_phb_mmio_enable(phb->opal_id,
				      OPAL_M64_WINDOW_TYPE,
				      phb->ioda.m64_bar_idx,
				      OPAL_ENABLE_M64_SPLIT);
	if (rc != OPAL_SUCCESS) {
		desc = "enabling";
		goto fail;
	}

	/* Mark the M64 BAR assigned */
	set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc);

	/*
	 * Strip off the segment used by the reserved PE, which is
	 * expected to be 0 or the last PE supported by the PHB.
	 */
	r = &phb->hose->mem_resources[1];
	if (phb->ioda.reserved_pe == 0)
		r->start += phb->ioda.m64_segsize;
	else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1))
		r->end -= phb->ioda.m64_segsize;
	else
		pr_warn(" Cannot strip M64 segment for reserved PE#%d\n",
			phb->ioda.reserved_pe);

	return 0;

fail:
	pr_warn(" Failure %lld %s M64 BAR#%d\n",
		rc, desc, phb->ioda.m64_bar_idx);
	opal_pci_phb_mmio_enable(phb->opal_id,
				 OPAL_M64_WINDOW_TYPE,
				 phb->ioda.m64_bar_idx,
				 OPAL_DISABLE_M64);
	return -EIO;
}

static void pnv_ioda2_reserve_m64_pe(struct pnv_phb *phb)
{
	resource_size_t sgsz = phb->ioda.m64_segsize;
	struct pci_dev *pdev;
	struct resource *r;
	int base, step, i;

	/*
	 * The root bus always covers the full M64 range, while a root
	 * port covers only the M64 range that is used in reality. So
	 * we check the root ports instead of the root bus.
	 */
	list_for_each_entry(pdev, &phb->hose->bus->devices, bus_list) {
		for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
			r = &pdev->resource[PCI_BRIDGE_RESOURCES + i];
			if (!r->parent ||
			    !pnv_pci_is_mem_pref_64(r->flags))
				continue;

			base = (r->start - phb->ioda.m64_base) / sgsz;
			for (step = 0; step < resource_size(r) / sgsz; step++)
				pnv_ioda_reserve_pe(phb, base + step);
		}
	}
}

static int pnv_ioda2_pick_m64_pe(struct pnv_phb *phb,
				 struct pci_bus *bus, int all)
{
	resource_size_t segsz = phb->ioda.m64_segsize;
	struct pci_dev *pdev;
	struct resource *r;
	struct pnv_ioda_pe *master_pe, *pe;
	unsigned long size, *pe_alloc;
	bool found;
	int start, i, j;

	/* Root bus shouldn't use M64 */
	if (pci_is_root_bus(bus))
		return IODA_INVALID_PE;

	/* We support only one M64 window on each bus */
	found = false;
	pci_bus_for_each_resource(bus, r, i) {
		if (r && r->parent &&
		    pnv_pci_is_mem_pref_64(r->flags)) {
			found = true;
			break;
		}
	}

	/* No M64 window found ? */
	if (!found)
		return IODA_INVALID_PE;

	/* Allocate bitmap */
	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
	pe_alloc = kzalloc(size, GFP_KERNEL);
	if (!pe_alloc) {
		pr_warn("%s: Out of memory !\n",
			__func__);
		return IODA_INVALID_PE;
	}

	/*
	 * Figure out the reserved PE numbers covered by the PE
	 * and its child PEs.
	 */
	start = (r->start - phb->ioda.m64_base) / segsz;
	for (i = 0; i < resource_size(r) / segsz; i++)
		set_bit(start + i, pe_alloc);

	if (all)
		goto done;

	/*
	 * If the PE doesn't cover all subordinate buses,
	 * we need to subtract the reserved PEs of its children.
	 */
	list_for_each_entry(pdev, &bus->devices, bus_list) {
		if (!pdev->subordinate)
			continue;

		pci_bus_for_each_resource(pdev->subordinate, r, i) {
			if (!r || !r->parent ||
			    !pnv_pci_is_mem_pref_64(r->flags))
				continue;

			start = (r->start - phb->ioda.m64_base) / segsz;
			for (j = 0; j < resource_size(r) / segsz ; j++)
				clear_bit(start + j, pe_alloc);
		}
	}

	/*
	 * The current bus might not own an M64 window itself; it may
	 * all be contributed by its child buses. In that case, we
	 * needn't pick an M64 dependent PE#.
	 */
	if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) {
		kfree(pe_alloc);
		return IODA_INVALID_PE;
	}

	/*
	 * Figure out the master PE and put all slave PEs to master
	 * PE's list to form compound PE.
332 */ 333done: 334 master_pe = NULL; 335 i = -1; 336 while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) < 337 phb->ioda.total_pe) { 338 pe = &phb->ioda.pe_array[i]; 339 340 if (!master_pe) { 341 pe->flags |= PNV_IODA_PE_MASTER; 342 INIT_LIST_HEAD(&pe->slaves); 343 master_pe = pe; 344 } else { 345 pe->flags |= PNV_IODA_PE_SLAVE; 346 pe->master = master_pe; 347 list_add_tail(&pe->list, &master_pe->slaves); 348 } 349 } 350 351 kfree(pe_alloc); 352 return master_pe->pe_number; 353} 354 355static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) 356{ 357 struct pci_controller *hose = phb->hose; 358 struct device_node *dn = hose->dn; 359 struct resource *res; 360 const u32 *r; 361 u64 pci_addr; 362 363 /* FIXME: Support M64 for P7IOC */ 364 if (phb->type != PNV_PHB_IODA2) { 365 pr_info(" Not support M64 window\n"); 366 return; 367 } 368 369 if (!firmware_has_feature(FW_FEATURE_OPALv3)) { 370 pr_info(" Firmware too old to support M64 window\n"); 371 return; 372 } 373 374 r = of_get_property(dn, "ibm,opal-m64-window", NULL); 375 if (!r) { 376 pr_info(" No <ibm,opal-m64-window> on %s\n", 377 dn->full_name); 378 return; 379 } 380 381 res = &hose->mem_resources[1]; 382 res->start = of_translate_address(dn, r + 2); 383 res->end = res->start + of_read_number(r + 4, 2) - 1; 384 res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); 385 pci_addr = of_read_number(r, 2); 386 hose->mem_offset[1] = res->start - pci_addr; 387 388 phb->ioda.m64_size = resource_size(res); 389 phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe; 390 phb->ioda.m64_base = pci_addr; 391 392 pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n", 393 res->start, res->end, pci_addr); 394 395 /* Use last M64 BAR to cover M64 window */ 396 phb->ioda.m64_bar_idx = 15; 397 phb->init_m64 = pnv_ioda2_init_m64; 398 phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe; 399 phb->pick_m64_pe = pnv_ioda2_pick_m64_pe; 400} 401 402static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) 403{ 404 struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no]; 405 struct pnv_ioda_pe *slave; 406 s64 rc; 407 408 /* Fetch master PE */ 409 if (pe->flags & PNV_IODA_PE_SLAVE) { 410 pe = pe->master; 411 if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER))) 412 return; 413 414 pe_no = pe->pe_number; 415 } 416 417 /* Freeze master PE */ 418 rc = opal_pci_eeh_freeze_set(phb->opal_id, 419 pe_no, 420 OPAL_EEH_ACTION_SET_FREEZE_ALL); 421 if (rc != OPAL_SUCCESS) { 422 pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n", 423 __func__, rc, phb->hose->global_number, pe_no); 424 return; 425 } 426 427 /* Freeze slave PEs */ 428 if (!(pe->flags & PNV_IODA_PE_MASTER)) 429 return; 430 431 list_for_each_entry(slave, &pe->slaves, list) { 432 rc = opal_pci_eeh_freeze_set(phb->opal_id, 433 slave->pe_number, 434 OPAL_EEH_ACTION_SET_FREEZE_ALL); 435 if (rc != OPAL_SUCCESS) 436 pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n", 437 __func__, rc, phb->hose->global_number, 438 slave->pe_number); 439 } 440} 441 442static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt) 443{ 444 struct pnv_ioda_pe *pe, *slave; 445 s64 rc; 446 447 /* Find master PE */ 448 pe = &phb->ioda.pe_array[pe_no]; 449 if (pe->flags & PNV_IODA_PE_SLAVE) { 450 pe = pe->master; 451 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)); 452 pe_no = pe->pe_number; 453 } 454 455 /* Clear frozen state for master PE */ 456 rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt); 457 if (rc != OPAL_SUCCESS) { 458 pr_warn("%s: Failure %lld clear %d on 
PHB#%x-PE#%x\n", 459 __func__, rc, opt, phb->hose->global_number, pe_no); 460 return -EIO; 461 } 462 463 if (!(pe->flags & PNV_IODA_PE_MASTER)) 464 return 0; 465 466 /* Clear frozen state for slave PEs */ 467 list_for_each_entry(slave, &pe->slaves, list) { 468 rc = opal_pci_eeh_freeze_clear(phb->opal_id, 469 slave->pe_number, 470 opt); 471 if (rc != OPAL_SUCCESS) { 472 pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n", 473 __func__, rc, opt, phb->hose->global_number, 474 slave->pe_number); 475 return -EIO; 476 } 477 } 478 479 return 0; 480} 481 482static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) 483{ 484 struct pnv_ioda_pe *slave, *pe; 485 u8 fstate, state; 486 __be16 pcierr; 487 s64 rc; 488 489 /* Sanity check on PE number */ 490 if (pe_no < 0 || pe_no >= phb->ioda.total_pe) 491 return OPAL_EEH_STOPPED_PERM_UNAVAIL; 492 493 /* 494 * Fetch the master PE and the PE instance might be 495 * not initialized yet. 496 */ 497 pe = &phb->ioda.pe_array[pe_no]; 498 if (pe->flags & PNV_IODA_PE_SLAVE) { 499 pe = pe->master; 500 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)); 501 pe_no = pe->pe_number; 502 } 503 504 /* Check the master PE */ 505 rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, 506 &state, &pcierr, NULL); 507 if (rc != OPAL_SUCCESS) { 508 pr_warn("%s: Failure %lld getting " 509 "PHB#%x-PE#%x state\n", 510 __func__, rc, 511 phb->hose->global_number, pe_no); 512 return OPAL_EEH_STOPPED_TEMP_UNAVAIL; 513 } 514 515 /* Check the slave PE */ 516 if (!(pe->flags & PNV_IODA_PE_MASTER)) 517 return state; 518 519 list_for_each_entry(slave, &pe->slaves, list) { 520 rc = opal_pci_eeh_freeze_status(phb->opal_id, 521 slave->pe_number, 522 &fstate, 523 &pcierr, 524 NULL); 525 if (rc != OPAL_SUCCESS) { 526 pr_warn("%s: Failure %lld getting " 527 "PHB#%x-PE#%x state\n", 528 __func__, rc, 529 phb->hose->global_number, slave->pe_number); 530 return OPAL_EEH_STOPPED_TEMP_UNAVAIL; 531 } 532 533 /* 534 * Override the result based on the ascending 535 * priority. 536 */ 537 if (fstate > state) 538 state = fstate; 539 } 540 541 return state; 542} 543 544/* Currently those 2 are only used when MSIs are enabled, this will change 545 * but in the meantime, we need to protect them to avoid warnings 546 */ 547#ifdef CONFIG_PCI_MSI 548static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) 549{ 550 struct pci_controller *hose = pci_bus_to_host(dev->bus); 551 struct pnv_phb *phb = hose->private_data; 552 struct pci_dn *pdn = pci_get_pdn(dev); 553 554 if (!pdn) 555 return NULL; 556 if (pdn->pe_number == IODA_INVALID_PE) 557 return NULL; 558 return &phb->ioda.pe_array[pdn->pe_number]; 559} 560#endif /* CONFIG_PCI_MSI */ 561 562static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, 563 struct pnv_ioda_pe *parent, 564 struct pnv_ioda_pe *child, 565 bool is_add) 566{ 567 const char *desc = is_add ? "adding" : "removing"; 568 uint8_t op = is_add ? 
OPAL_ADD_PE_TO_DOMAIN :
			OPAL_REMOVE_PE_FROM_DOMAIN;
	struct pnv_ioda_pe *slave;
	long rc;

	/* Parent PE affects child PE */
	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
				child->pe_number, op);
	if (rc != OPAL_SUCCESS) {
		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
			rc, desc);
		return -ENXIO;
	}

	if (!(child->flags & PNV_IODA_PE_MASTER))
		return 0;

	/* Compound case: parent PE affects slave PEs */
	list_for_each_entry(slave, &child->slaves, list) {
		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
					slave->pe_number, op);
		if (rc != OPAL_SUCCESS) {
			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
				rc, desc);
			return -ENXIO;
		}
	}

	return 0;
}

static int pnv_ioda_set_peltv(struct pnv_phb *phb,
			      struct pnv_ioda_pe *pe,
			      bool is_add)
{
	struct pnv_ioda_pe *slave;
	struct pci_dev *pdev = NULL;
	int ret;

	/*
	 * Clear the PE frozen state. If it's a master PE, we need
	 * to clear the slave PEs' frozen state as well.
	 */
	if (is_add) {
		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
		if (pe->flags & PNV_IODA_PE_MASTER) {
			list_for_each_entry(slave, &pe->slaves, list)
				opal_pci_eeh_freeze_clear(phb->opal_id,
					slave->pe_number,
					OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
		}
	}

	/*
	 * Associate the PE in PELT. We also need to add the PE to the
	 * corresponding PELT-V. Otherwise, an error originating from
	 * the PE might contribute to other PEs.
	 */
	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
	if (ret)
		return ret;

	/* For compound PEs, any one affects all of them */
	if (pe->flags & PNV_IODA_PE_MASTER) {
		list_for_each_entry(slave, &pe->slaves, list) {
			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
			if (ret)
				return ret;
		}
	}

	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
		pdev = pe->pbus->self;
	else if (pe->flags & PNV_IODA_PE_DEV)
		pdev = pe->pdev->bus->self;
#ifdef CONFIG_PCI_IOV
	else if (pe->flags & PNV_IODA_PE_VF)
		pdev = pe->parent_dev->bus->self;
#endif /* CONFIG_PCI_IOV */
	while (pdev) {
		struct pci_dn *pdn = pci_get_pdn(pdev);
		struct pnv_ioda_pe *parent;

		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
			parent = &phb->ioda.pe_array[pdn->pe_number];
			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
			if (ret)
				return ret;
		}

		pdev = pdev->bus->self;
	}

	return 0;
}

#ifdef CONFIG_PCI_IOV
static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
	struct pci_dev *parent;
	uint8_t bcomp, dcomp, fcomp;
	int64_t rc;
	long rid_end, rid;

	/* Currently, we just deconfigure VF PE.
Bus PE will always there.*/ 675 if (pe->pbus) { 676 int count; 677 678 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; 679 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; 680 parent = pe->pbus->self; 681 if (pe->flags & PNV_IODA_PE_BUS_ALL) 682 count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; 683 else 684 count = 1; 685 686 switch(count) { 687 case 1: bcomp = OpalPciBusAll; break; 688 case 2: bcomp = OpalPciBus7Bits; break; 689 case 4: bcomp = OpalPciBus6Bits; break; 690 case 8: bcomp = OpalPciBus5Bits; break; 691 case 16: bcomp = OpalPciBus4Bits; break; 692 case 32: bcomp = OpalPciBus3Bits; break; 693 default: 694 dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", 695 count); 696 /* Do an exact match only */ 697 bcomp = OpalPciBusAll; 698 } 699 rid_end = pe->rid + (count << 8); 700 } else { 701 if (pe->flags & PNV_IODA_PE_VF) 702 parent = pe->parent_dev; 703 else 704 parent = pe->pdev->bus->self; 705 bcomp = OpalPciBusAll; 706 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; 707 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; 708 rid_end = pe->rid + 1; 709 } 710 711 /* Clear the reverse map */ 712 for (rid = pe->rid; rid < rid_end; rid++) 713 phb->ioda.pe_rmap[rid] = 0; 714 715 /* Release from all parents PELT-V */ 716 while (parent) { 717 struct pci_dn *pdn = pci_get_pdn(parent); 718 if (pdn && pdn->pe_number != IODA_INVALID_PE) { 719 rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, 720 pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); 721 /* XXX What to do in case of error ? */ 722 } 723 parent = parent->bus->self; 724 } 725 726 opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number, 727 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); 728 729 /* Disassociate PE in PELT */ 730 rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, 731 pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); 732 if (rc) 733 pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); 734 rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, 735 bcomp, dcomp, fcomp, OPAL_UNMAP_PE); 736 if (rc) 737 pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); 738 739 pe->pbus = NULL; 740 pe->pdev = NULL; 741 pe->parent_dev = NULL; 742 743 return 0; 744} 745#endif /* CONFIG_PCI_IOV */ 746 747static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) 748{ 749 struct pci_dev *parent; 750 uint8_t bcomp, dcomp, fcomp; 751 long rc, rid_end, rid; 752 753 /* Bus validation ? */ 754 if (pe->pbus) { 755 int count; 756 757 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; 758 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; 759 parent = pe->pbus->self; 760 if (pe->flags & PNV_IODA_PE_BUS_ALL) 761 count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; 762 else 763 count = 1; 764 765 switch(count) { 766 case 1: bcomp = OpalPciBusAll; break; 767 case 2: bcomp = OpalPciBus7Bits; break; 768 case 4: bcomp = OpalPciBus6Bits; break; 769 case 8: bcomp = OpalPciBus5Bits; break; 770 case 16: bcomp = OpalPciBus4Bits; break; 771 case 32: bcomp = OpalPciBus3Bits; break; 772 default: 773 dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", 774 count); 775 /* Do an exact match only */ 776 bcomp = OpalPciBusAll; 777 } 778 rid_end = pe->rid + (count << 8); 779 } else { 780#ifdef CONFIG_PCI_IOV 781 if (pe->flags & PNV_IODA_PE_VF) 782 parent = pe->parent_dev; 783 else 784#endif /* CONFIG_PCI_IOV */ 785 parent = pe->pdev->bus->self; 786 bcomp = OpalPciBusAll; 787 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; 788 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; 789 rid_end = pe->rid + 1; 790 } 791 792 /* 793 * Associate PE in PELT. 
We need add the PE into the 794 * corresponding PELT-V as well. Otherwise, the error 795 * originated from the PE might contribute to other 796 * PEs. 797 */ 798 rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, 799 bcomp, dcomp, fcomp, OPAL_MAP_PE); 800 if (rc) { 801 pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); 802 return -ENXIO; 803 } 804 805 /* Configure PELTV */ 806 pnv_ioda_set_peltv(phb, pe, true); 807 808 /* Setup reverse map */ 809 for (rid = pe->rid; rid < rid_end; rid++) 810 phb->ioda.pe_rmap[rid] = pe->pe_number; 811 812 /* Setup one MVTs on IODA1 */ 813 if (phb->type != PNV_PHB_IODA1) { 814 pe->mve_number = 0; 815 goto out; 816 } 817 818 pe->mve_number = pe->pe_number; 819 rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number); 820 if (rc != OPAL_SUCCESS) { 821 pe_err(pe, "OPAL error %ld setting up MVE %d\n", 822 rc, pe->mve_number); 823 pe->mve_number = -1; 824 } else { 825 rc = opal_pci_set_mve_enable(phb->opal_id, 826 pe->mve_number, OPAL_ENABLE_MVE); 827 if (rc) { 828 pe_err(pe, "OPAL error %ld enabling MVE %d\n", 829 rc, pe->mve_number); 830 pe->mve_number = -1; 831 } 832 } 833 834out: 835 return 0; 836} 837 838static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, 839 struct pnv_ioda_pe *pe) 840{ 841 struct pnv_ioda_pe *lpe; 842 843 list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { 844 if (lpe->dma_weight < pe->dma_weight) { 845 list_add_tail(&pe->dma_link, &lpe->dma_link); 846 return; 847 } 848 } 849 list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); 850} 851 852static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) 853{ 854 /* This is quite simplistic. The "base" weight of a device 855 * is 10. 0 means no DMA is to be accounted for it. 856 */ 857 858 /* If it's a bridge, no DMA */ 859 if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) 860 return 0; 861 862 /* Reduce the weight of slow USB controllers */ 863 if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || 864 dev->class == PCI_CLASS_SERIAL_USB_OHCI || 865 dev->class == PCI_CLASS_SERIAL_USB_EHCI) 866 return 3; 867 868 /* Increase the weight of RAID (includes Obsidian) */ 869 if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) 870 return 15; 871 872 /* Default */ 873 return 10; 874} 875 876#ifdef CONFIG_PCI_IOV 877static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) 878{ 879 struct pci_dn *pdn = pci_get_pdn(dev); 880 int i; 881 struct resource *res, res2; 882 resource_size_t size; 883 u16 num_vfs; 884 885 if (!dev->is_physfn) 886 return -EINVAL; 887 888 /* 889 * "offset" is in VFs. The M64 windows are sized so that when they 890 * are segmented, each segment is the same size as the IOV BAR. 891 * Each segment is in a separate PE, and the high order bits of the 892 * address are the PE number. Therefore, each VF's BAR is in a 893 * separate PE, and changing the IOV BAR start address changes the 894 * range of PEs the VFs are in. 895 */ 896 num_vfs = pdn->num_vfs; 897 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { 898 res = &dev->resource[i + PCI_IOV_RESOURCES]; 899 if (!res->flags || !res->parent) 900 continue; 901 902 if (!pnv_pci_is_mem_pref_64(res->flags)) 903 continue; 904 905 /* 906 * The actual IOV BAR range is determined by the start address 907 * and the actual size for num_vfs VFs BAR. This check is to 908 * make sure that after shifting, the range will not overlap 909 * with another device. 
		 */
		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
		res2.flags = res->flags;
		res2.start = res->start + (size * offset);
		res2.end = res2.start + (size * num_vfs) - 1;

		if (res2.end > res->end) {
			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
				i, &res2, res, num_vfs, offset);
			return -EBUSY;
		}
	}

	/*
	 * After doing so, there would be a "hole" in /proc/iomem when
	 * offset is a positive value. It looks as if the device returned
	 * some MMIO range back to the system, although nobody can
	 * actually use it.
	 */
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = &dev->resource[i + PCI_IOV_RESOURCES];
		if (!res->flags || !res->parent)
			continue;

		if (!pnv_pci_is_mem_pref_64(res->flags))
			continue;

		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
		res2 = *res;
		res->start += size * offset;

		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
			 i, &res2, res, num_vfs, offset);
		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
	}
	return 0;
}
#endif /* CONFIG_PCI_IOV */

#if 0
static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
{
	struct pci_controller *hose = pci_bus_to_host(dev->bus);
	struct pnv_phb *phb = hose->private_data;
	struct pci_dn *pdn = pci_get_pdn(dev);
	struct pnv_ioda_pe *pe;
	int pe_num;

	if (!pdn) {
		pr_err("%s: Device tree node not associated properly\n",
		       pci_name(dev));
		return NULL;
	}
	if (pdn->pe_number != IODA_INVALID_PE)
		return NULL;

	/* PE#0 has been pre-set */
	if (dev->bus->number == 0)
		pe_num = 0;
	else
		pe_num = pnv_ioda_alloc_pe(phb);
	if (pe_num == IODA_INVALID_PE) {
		pr_warning("%s: Not enough PE# available, disabling device\n",
			   pci_name(dev));
		return NULL;
	}

	/* NOTE: We get only one ref to the pci_dev for the pdn, not for the
	 * pointer in the PE data structure, both should be destroyed at the
	 * same time. However, this needs to be looked at more closely again
	 * once we actually start removing things (Hotplug, SR-IOV, ...)
	 *
	 * At some point we want to remove the PDN completely anyways
	 */
	pe = &phb->ioda.pe_array[pe_num];
	pci_dev_get(dev);
	pdn->pcidev = dev;
	pdn->pe_number = pe_num;
	pe->pdev = dev;
	pe->pbus = NULL;
	pe->tce32_seg = -1;
	pe->mve_number = -1;
	pe->rid = dev->bus->number << 8 | pdn->devfn;

	pe_info(pe, "Associated device to PE\n");

	if (pnv_ioda_configure_pe(phb, pe)) {
		/* XXX What do we do here ? */
		if (pe_num)
			pnv_ioda_free_pe(phb, pe_num);
		pdn->pe_number = IODA_INVALID_PE;
		pe->pdev = NULL;
		pci_dev_put(dev);
		return NULL;
	}

	/* Assign a DMA weight to the device */
	pe->dma_weight = pnv_ioda_dma_weight(dev);
	if (pe->dma_weight != 0) {
		phb->ioda.dma_weight += pe->dma_weight;
		phb->ioda.dma_pe_count++;
	}

	/* Link the PE */
	pnv_ioda_link_pe_by_weight(phb, pe);

	return pe;
}
#endif /* Useful for SRIOV case */

static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
{
	struct pci_dev *dev;

	list_for_each_entry(dev, &bus->devices, bus_list) {
		struct pci_dn *pdn = pci_get_pdn(dev);

		if (pdn == NULL) {
			pr_warn("%s: No device node associated with device !\n",
				pci_name(dev));
			continue;
		}
		pdn->pe_number = pe->pe_number;
		pe->dma_weight += pnv_ioda_dma_weight(dev);
		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
			pnv_ioda_setup_same_PE(dev->subordinate, pe);
	}
}

/*
 * There are 2 types of PCI bus sensitive PEs: one comprises a single
 * PCI bus, and the other contains the primary PCI bus plus its
 * subordinate PCI devices and buses. The second type of PE normally
 * originates from a PCIe-to-PCI bridge or a PLX switch downstream port.
 */
static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
{
	struct pci_controller *hose = pci_bus_to_host(bus);
	struct pnv_phb *phb = hose->private_data;
	struct pnv_ioda_pe *pe;
	int pe_num = IODA_INVALID_PE;

	/* Check if PE is determined by M64 */
	if (phb->pick_m64_pe)
		pe_num = phb->pick_m64_pe(phb, bus, all);

	/* The PE number isn't pinned by M64 */
	if (pe_num == IODA_INVALID_PE)
		pe_num = pnv_ioda_alloc_pe(phb);

	if (pe_num == IODA_INVALID_PE) {
		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
			   __func__, pci_domain_nr(bus), bus->number);
		return;
	}

	pe = &phb->ioda.pe_array[pe_num];
	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
	pe->pbus = bus;
	pe->pdev = NULL;
	pe->tce32_seg = -1;
	pe->mve_number = -1;
	pe->rid = bus->busn_res.start << 8;
	pe->dma_weight = 0;

	if (all)
		pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
			bus->busn_res.start, bus->busn_res.end, pe_num);
	else
		pe_info(pe, "Secondary bus %d associated with PE#%d\n",
			bus->busn_res.start, pe_num);

	if (pnv_ioda_configure_pe(phb, pe)) {
		/* XXX What do we do here ?
*/ 1083 if (pe_num) 1084 pnv_ioda_free_pe(phb, pe_num); 1085 pe->pbus = NULL; 1086 return; 1087 } 1088 1089 pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), 1090 GFP_KERNEL, hose->node); 1091 pe->tce32_table->data = pe; 1092 1093 /* Associate it with all child devices */ 1094 pnv_ioda_setup_same_PE(bus, pe); 1095 1096 /* Put PE to the list */ 1097 list_add_tail(&pe->list, &phb->ioda.pe_list); 1098 1099 /* Account for one DMA PE if at least one DMA capable device exist 1100 * below the bridge 1101 */ 1102 if (pe->dma_weight != 0) { 1103 phb->ioda.dma_weight += pe->dma_weight; 1104 phb->ioda.dma_pe_count++; 1105 } 1106 1107 /* Link the PE */ 1108 pnv_ioda_link_pe_by_weight(phb, pe); 1109} 1110 1111static void pnv_ioda_setup_PEs(struct pci_bus *bus) 1112{ 1113 struct pci_dev *dev; 1114 1115 pnv_ioda_setup_bus_PE(bus, 0); 1116 1117 list_for_each_entry(dev, &bus->devices, bus_list) { 1118 if (dev->subordinate) { 1119 if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) 1120 pnv_ioda_setup_bus_PE(dev->subordinate, 1); 1121 else 1122 pnv_ioda_setup_PEs(dev->subordinate); 1123 } 1124 } 1125} 1126 1127/* 1128 * Configure PEs so that the downstream PCI buses and devices 1129 * could have their associated PE#. Unfortunately, we didn't 1130 * figure out the way to identify the PLX bridge yet. So we 1131 * simply put the PCI bus and the subordinate behind the root 1132 * port to PE# here. The game rule here is expected to be changed 1133 * as soon as we can detected PLX bridge correctly. 1134 */ 1135static void pnv_pci_ioda_setup_PEs(void) 1136{ 1137 struct pci_controller *hose, *tmp; 1138 struct pnv_phb *phb; 1139 1140 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 1141 phb = hose->private_data; 1142 1143 /* M64 layout might affect PE allocation */ 1144 if (phb->reserve_m64_pe) 1145 phb->reserve_m64_pe(phb); 1146 1147 pnv_ioda_setup_PEs(hose->bus); 1148 } 1149} 1150 1151#ifdef CONFIG_PCI_IOV 1152static int pnv_pci_vf_release_m64(struct pci_dev *pdev) 1153{ 1154 struct pci_bus *bus; 1155 struct pci_controller *hose; 1156 struct pnv_phb *phb; 1157 struct pci_dn *pdn; 1158 int i, j; 1159 1160 bus = pdev->bus; 1161 hose = pci_bus_to_host(bus); 1162 phb = hose->private_data; 1163 pdn = pci_get_pdn(pdev); 1164 1165 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) 1166 for (j = 0; j < M64_PER_IOV; j++) { 1167 if (pdn->m64_wins[i][j] == IODA_INVALID_M64) 1168 continue; 1169 opal_pci_phb_mmio_enable(phb->opal_id, 1170 OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); 1171 clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); 1172 pdn->m64_wins[i][j] = IODA_INVALID_M64; 1173 } 1174 1175 return 0; 1176} 1177 1178static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) 1179{ 1180 struct pci_bus *bus; 1181 struct pci_controller *hose; 1182 struct pnv_phb *phb; 1183 struct pci_dn *pdn; 1184 unsigned int win; 1185 struct resource *res; 1186 int i, j; 1187 int64_t rc; 1188 int total_vfs; 1189 resource_size_t size, start; 1190 int pe_num; 1191 int vf_groups; 1192 int vf_per_group; 1193 1194 bus = pdev->bus; 1195 hose = pci_bus_to_host(bus); 1196 phb = hose->private_data; 1197 pdn = pci_get_pdn(pdev); 1198 total_vfs = pci_sriov_get_totalvfs(pdev); 1199 1200 /* Initialize the m64_wins to IODA_INVALID_M64 */ 1201 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) 1202 for (j = 0; j < M64_PER_IOV; j++) 1203 pdn->m64_wins[i][j] = IODA_INVALID_M64; 1204 1205 if (pdn->m64_per_iov == M64_PER_IOV) { 1206 vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; 1207 vf_per_group = (num_vfs <= M64_PER_IOV)? 
1: 1208 roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; 1209 } else { 1210 vf_groups = 1; 1211 vf_per_group = 1; 1212 } 1213 1214 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { 1215 res = &pdev->resource[i + PCI_IOV_RESOURCES]; 1216 if (!res->flags || !res->parent) 1217 continue; 1218 1219 if (!pnv_pci_is_mem_pref_64(res->flags)) 1220 continue; 1221 1222 for (j = 0; j < vf_groups; j++) { 1223 do { 1224 win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, 1225 phb->ioda.m64_bar_idx + 1, 0); 1226 1227 if (win >= phb->ioda.m64_bar_idx + 1) 1228 goto m64_failed; 1229 } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); 1230 1231 pdn->m64_wins[i][j] = win; 1232 1233 if (pdn->m64_per_iov == M64_PER_IOV) { 1234 size = pci_iov_resource_size(pdev, 1235 PCI_IOV_RESOURCES + i); 1236 size = size * vf_per_group; 1237 start = res->start + size * j; 1238 } else { 1239 size = resource_size(res); 1240 start = res->start; 1241 } 1242 1243 /* Map the M64 here */ 1244 if (pdn->m64_per_iov == M64_PER_IOV) { 1245 pe_num = pdn->offset + j; 1246 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 1247 pe_num, OPAL_M64_WINDOW_TYPE, 1248 pdn->m64_wins[i][j], 0); 1249 } 1250 1251 rc = opal_pci_set_phb_mem_window(phb->opal_id, 1252 OPAL_M64_WINDOW_TYPE, 1253 pdn->m64_wins[i][j], 1254 start, 1255 0, /* unused */ 1256 size); 1257 1258 1259 if (rc != OPAL_SUCCESS) { 1260 dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", 1261 win, rc); 1262 goto m64_failed; 1263 } 1264 1265 if (pdn->m64_per_iov == M64_PER_IOV) 1266 rc = opal_pci_phb_mmio_enable(phb->opal_id, 1267 OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2); 1268 else 1269 rc = opal_pci_phb_mmio_enable(phb->opal_id, 1270 OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1); 1271 1272 if (rc != OPAL_SUCCESS) { 1273 dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", 1274 win, rc); 1275 goto m64_failed; 1276 } 1277 } 1278 } 1279 return 0; 1280 1281m64_failed: 1282 pnv_pci_vf_release_m64(pdev); 1283 return -EBUSY; 1284} 1285 1286static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) 1287{ 1288 struct pci_bus *bus; 1289 struct pci_controller *hose; 1290 struct pnv_phb *phb; 1291 struct iommu_table *tbl; 1292 unsigned long addr; 1293 int64_t rc; 1294 1295 bus = dev->bus; 1296 hose = pci_bus_to_host(bus); 1297 phb = hose->private_data; 1298 tbl = pe->tce32_table; 1299 addr = tbl->it_base; 1300 1301 opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, 1302 pe->pe_number << 1, 1, __pa(addr), 1303 0, 0x1000); 1304 1305 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, 1306 pe->pe_number, 1307 (pe->pe_number << 1) + 1, 1308 pe->tce_bypass_base, 1309 0); 1310 if (rc) 1311 pe_warn(pe, "OPAL error %ld release DMA window\n", rc); 1312 1313 iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); 1314 free_pages(addr, get_order(TCE32_TABLE_SIZE)); 1315 pe->tce32_table = NULL; 1316} 1317 1318static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) 1319{ 1320 struct pci_bus *bus; 1321 struct pci_controller *hose; 1322 struct pnv_phb *phb; 1323 struct pnv_ioda_pe *pe, *pe_n; 1324 struct pci_dn *pdn; 1325 u16 vf_index; 1326 int64_t rc; 1327 1328 bus = pdev->bus; 1329 hose = pci_bus_to_host(bus); 1330 phb = hose->private_data; 1331 pdn = pci_get_pdn(pdev); 1332 1333 if (!pdev->is_physfn) 1334 return; 1335 1336 if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { 1337 int vf_group; 1338 int vf_per_group; 1339 int vf_index1; 1340 1341 vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; 1342 1343 for 
(vf_group = 0; vf_group < M64_PER_IOV; vf_group++) 1344 for (vf_index = vf_group * vf_per_group; 1345 vf_index < (vf_group + 1) * vf_per_group && 1346 vf_index < num_vfs; 1347 vf_index++) 1348 for (vf_index1 = vf_group * vf_per_group; 1349 vf_index1 < (vf_group + 1) * vf_per_group && 1350 vf_index1 < num_vfs; 1351 vf_index1++){ 1352 1353 rc = opal_pci_set_peltv(phb->opal_id, 1354 pdn->offset + vf_index, 1355 pdn->offset + vf_index1, 1356 OPAL_REMOVE_PE_FROM_DOMAIN); 1357 1358 if (rc) 1359 dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n", 1360 __func__, 1361 pdn->offset + vf_index1, rc); 1362 } 1363 } 1364 1365 list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { 1366 if (pe->parent_dev != pdev) 1367 continue; 1368 1369 pnv_pci_ioda2_release_dma_pe(pdev, pe); 1370 1371 /* Remove from list */ 1372 mutex_lock(&phb->ioda.pe_list_mutex); 1373 list_del(&pe->list); 1374 mutex_unlock(&phb->ioda.pe_list_mutex); 1375 1376 pnv_ioda_deconfigure_pe(phb, pe); 1377 1378 pnv_ioda_free_pe(phb, pe->pe_number); 1379 } 1380} 1381 1382void pnv_pci_sriov_disable(struct pci_dev *pdev) 1383{ 1384 struct pci_bus *bus; 1385 struct pci_controller *hose; 1386 struct pnv_phb *phb; 1387 struct pci_dn *pdn; 1388 struct pci_sriov *iov; 1389 u16 num_vfs; 1390 1391 bus = pdev->bus; 1392 hose = pci_bus_to_host(bus); 1393 phb = hose->private_data; 1394 pdn = pci_get_pdn(pdev); 1395 iov = pdev->sriov; 1396 num_vfs = pdn->num_vfs; 1397 1398 /* Release VF PEs */ 1399 pnv_ioda_release_vf_PE(pdev, num_vfs); 1400 1401 if (phb->type == PNV_PHB_IODA2) { 1402 if (pdn->m64_per_iov == 1) 1403 pnv_pci_vf_resource_shift(pdev, -pdn->offset); 1404 1405 /* Release M64 windows */ 1406 pnv_pci_vf_release_m64(pdev); 1407 1408 /* Release PE numbers */ 1409 bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); 1410 pdn->offset = 0; 1411 } 1412} 1413 1414static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, 1415 struct pnv_ioda_pe *pe); 1416static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) 1417{ 1418 struct pci_bus *bus; 1419 struct pci_controller *hose; 1420 struct pnv_phb *phb; 1421 struct pnv_ioda_pe *pe; 1422 int pe_num; 1423 u16 vf_index; 1424 struct pci_dn *pdn; 1425 int64_t rc; 1426 1427 bus = pdev->bus; 1428 hose = pci_bus_to_host(bus); 1429 phb = hose->private_data; 1430 pdn = pci_get_pdn(pdev); 1431 1432 if (!pdev->is_physfn) 1433 return; 1434 1435 /* Reserve PE for each VF */ 1436 for (vf_index = 0; vf_index < num_vfs; vf_index++) { 1437 pe_num = pdn->offset + vf_index; 1438 1439 pe = &phb->ioda.pe_array[pe_num]; 1440 pe->pe_number = pe_num; 1441 pe->phb = phb; 1442 pe->flags = PNV_IODA_PE_VF; 1443 pe->pbus = NULL; 1444 pe->parent_dev = pdev; 1445 pe->tce32_seg = -1; 1446 pe->mve_number = -1; 1447 pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | 1448 pci_iov_virtfn_devfn(pdev, vf_index); 1449 1450 pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n", 1451 hose->global_number, pdev->bus->number, 1452 PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), 1453 PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num); 1454 1455 if (pnv_ioda_configure_pe(phb, pe)) { 1456 /* XXX What do we do here ? 
*/ 1457 if (pe_num) 1458 pnv_ioda_free_pe(phb, pe_num); 1459 pe->pdev = NULL; 1460 continue; 1461 } 1462 1463 pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), 1464 GFP_KERNEL, hose->node); 1465 pe->tce32_table->data = pe; 1466 1467 /* Put PE to the list */ 1468 mutex_lock(&phb->ioda.pe_list_mutex); 1469 list_add_tail(&pe->list, &phb->ioda.pe_list); 1470 mutex_unlock(&phb->ioda.pe_list_mutex); 1471 1472 pnv_pci_ioda2_setup_dma_pe(phb, pe); 1473 } 1474 1475 if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { 1476 int vf_group; 1477 int vf_per_group; 1478 int vf_index1; 1479 1480 vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; 1481 1482 for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) { 1483 for (vf_index = vf_group * vf_per_group; 1484 vf_index < (vf_group + 1) * vf_per_group && 1485 vf_index < num_vfs; 1486 vf_index++) { 1487 for (vf_index1 = vf_group * vf_per_group; 1488 vf_index1 < (vf_group + 1) * vf_per_group && 1489 vf_index1 < num_vfs; 1490 vf_index1++) { 1491 1492 rc = opal_pci_set_peltv(phb->opal_id, 1493 pdn->offset + vf_index, 1494 pdn->offset + vf_index1, 1495 OPAL_ADD_PE_TO_DOMAIN); 1496 1497 if (rc) 1498 dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n", 1499 __func__, 1500 pdn->offset + vf_index1, rc); 1501 } 1502 } 1503 } 1504 } 1505} 1506 1507int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) 1508{ 1509 struct pci_bus *bus; 1510 struct pci_controller *hose; 1511 struct pnv_phb *phb; 1512 struct pci_dn *pdn; 1513 int ret; 1514 1515 bus = pdev->bus; 1516 hose = pci_bus_to_host(bus); 1517 phb = hose->private_data; 1518 pdn = pci_get_pdn(pdev); 1519 1520 if (phb->type == PNV_PHB_IODA2) { 1521 /* Calculate available PE for required VFs */ 1522 mutex_lock(&phb->ioda.pe_alloc_mutex); 1523 pdn->offset = bitmap_find_next_zero_area( 1524 phb->ioda.pe_alloc, phb->ioda.total_pe, 1525 0, num_vfs, 0); 1526 if (pdn->offset >= phb->ioda.total_pe) { 1527 mutex_unlock(&phb->ioda.pe_alloc_mutex); 1528 dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); 1529 pdn->offset = 0; 1530 return -EBUSY; 1531 } 1532 bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs); 1533 pdn->num_vfs = num_vfs; 1534 mutex_unlock(&phb->ioda.pe_alloc_mutex); 1535 1536 /* Assign M64 window accordingly */ 1537 ret = pnv_pci_vf_assign_m64(pdev, num_vfs); 1538 if (ret) { 1539 dev_info(&pdev->dev, "Not enough M64 window resources\n"); 1540 goto m64_failed; 1541 } 1542 1543 /* 1544 * When using one M64 BAR to map one IOV BAR, we need to shift 1545 * the IOV BAR according to the PE# allocated to the VFs. 1546 * Otherwise, the PE# for the VF will conflict with others. 
		 */
		if (pdn->m64_per_iov == 1) {
			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
			if (ret)
				goto m64_failed;
		}
	}

	/* Setup VF PEs */
	pnv_ioda_setup_vf_PE(pdev, num_vfs);

	return 0;

m64_failed:
	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
	pdn->offset = 0;

	return ret;
}

int pcibios_sriov_disable(struct pci_dev *pdev)
{
	pnv_pci_sriov_disable(pdev);

	/* Release PCI data */
	remove_dev_pci_data(pdev);
	return 0;
}

int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
{
	/* Allocate PCI data */
	add_dev_pci_data(pdev);

	pnv_pci_sriov_enable(pdev, num_vfs);
	return 0;
}
#endif /* CONFIG_PCI_IOV */

static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
{
	struct pci_dn *pdn = pci_get_pdn(pdev);
	struct pnv_ioda_pe *pe;

	/*
	 * The function can be called while the PE#
	 * hasn't been assigned. Do nothing in that
	 * case.
	 */
	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
		return;

	pe = &phb->ioda.pe_array[pdn->pe_number];
	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
}

static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
				     struct pci_dev *pdev, u64 dma_mask)
{
	struct pci_dn *pdn = pci_get_pdn(pdev);
	struct pnv_ioda_pe *pe;
	uint64_t top;
	bool bypass = false;

	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
		return -ENODEV;

	pe = &phb->ioda.pe_array[pdn->pe_number];
	if (pe->tce_bypass_enabled) {
		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
		bypass = (dma_mask >= top);
	}

	if (bypass) {
		dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
		set_dma_ops(&pdev->dev, &dma_direct_ops);
		set_dma_offset(&pdev->dev, pe->tce_bypass_base);
	} else {
		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
		set_dma_ops(&pdev->dev, &dma_iommu_ops);
		set_iommu_table_base(&pdev->dev, pe->tce32_table);
	}
	*pdev->dev.dma_mask = dma_mask;
	return 0;
}

static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb,
					      struct pci_dev *pdev)
{
	struct pci_dn *pdn = pci_get_pdn(pdev);
	struct pnv_ioda_pe *pe;
	u64 end, mask;

	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
		return 0;

	pe = &phb->ioda.pe_array[pdn->pe_number];
	if (!pe->tce_bypass_enabled)
		return __dma_get_required_mask(&pdev->dev);

	end = pe->tce_bypass_base + memblock_end_of_DRAM();
	mask = 1ULL << (fls64(end) - 1);
	mask += mask - 1;

	return mask;
}

static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
				   struct pci_bus *bus,
				   bool add_to_iommu_group)
{
	struct pci_dev *dev;

	list_for_each_entry(dev, &bus->devices, bus_list) {
		if (add_to_iommu_group)
			set_iommu_table_base_and_group(&dev->dev,
						       pe->tce32_table);
		else
			set_iommu_table_base(&dev->dev, pe->tce32_table);

		if (dev->subordinate)
			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
					       add_to_iommu_group);
	}
}

static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
					 struct iommu_table *tbl,
					 __be64 *startp, __be64 *endp, bool rm)
{
	__be64 __iomem *invalidate = rm ?
1680 (__be64 __iomem *)pe->tce_inval_reg_phys : 1681 (__be64 __iomem *)tbl->it_index; 1682 unsigned long start, end, inc; 1683 const unsigned shift = tbl->it_page_shift; 1684 1685 start = __pa(startp); 1686 end = __pa(endp); 1687 1688 /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ 1689 if (tbl->it_busno) { 1690 start <<= shift; 1691 end <<= shift; 1692 inc = 128ull << shift; 1693 start |= tbl->it_busno; 1694 end |= tbl->it_busno; 1695 } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { 1696 /* p7ioc-style invalidation, 2 TCEs per write */ 1697 start |= (1ull << 63); 1698 end |= (1ull << 63); 1699 inc = 16; 1700 } else { 1701 /* Default (older HW) */ 1702 inc = 128; 1703 } 1704 1705 end |= inc - 1; /* round up end to be different than start */ 1706 1707 mb(); /* Ensure above stores are visible */ 1708 while (start <= end) { 1709 if (rm) 1710 __raw_rm_writeq(cpu_to_be64(start), invalidate); 1711 else 1712 __raw_writeq(cpu_to_be64(start), invalidate); 1713 start += inc; 1714 } 1715 1716 /* 1717 * The iommu layer will do another mb() for us on build() 1718 * and we don't care on free() 1719 */ 1720} 1721 1722static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, 1723 struct iommu_table *tbl, 1724 __be64 *startp, __be64 *endp, bool rm) 1725{ 1726 unsigned long start, end, inc; 1727 __be64 __iomem *invalidate = rm ? 1728 (__be64 __iomem *)pe->tce_inval_reg_phys : 1729 (__be64 __iomem *)tbl->it_index; 1730 const unsigned shift = tbl->it_page_shift; 1731 1732 /* We'll invalidate DMA address in PE scope */ 1733 start = 0x2ull << 60; 1734 start |= (pe->pe_number & 0xFF); 1735 end = start; 1736 1737 /* Figure out the start, end and step */ 1738 inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64)); 1739 start |= (inc << shift); 1740 inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64)); 1741 end |= (inc << shift); 1742 inc = (0x1ull << shift); 1743 mb(); 1744 1745 while (start <= end) { 1746 if (rm) 1747 __raw_rm_writeq(cpu_to_be64(start), invalidate); 1748 else 1749 __raw_writeq(cpu_to_be64(start), invalidate); 1750 start += inc; 1751 } 1752} 1753 1754void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, 1755 __be64 *startp, __be64 *endp, bool rm) 1756{ 1757 struct pnv_ioda_pe *pe = tbl->data; 1758 struct pnv_phb *phb = pe->phb; 1759 1760 if (phb->type == PNV_PHB_IODA1) 1761 pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); 1762 else 1763 pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); 1764} 1765 1766static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, 1767 struct pnv_ioda_pe *pe, unsigned int base, 1768 unsigned int segs) 1769{ 1770 1771 struct page *tce_mem = NULL; 1772 const __be64 *swinvp; 1773 struct iommu_table *tbl; 1774 unsigned int i; 1775 int64_t rc; 1776 void *addr; 1777 1778 /* XXX FIXME: Handle 64-bit only DMA devices */ 1779 /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ 1780 /* XXX FIXME: Allocate multi-level tables on PHB3 */ 1781 1782 /* We shouldn't already have a 32-bit DMA associated */ 1783 if (WARN_ON(pe->tce32_seg >= 0)) 1784 return; 1785 1786 /* Grab a 32-bit TCE table */ 1787 pe->tce32_seg = base; 1788 pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", 1789 (base << 28), ((base + segs) << 28) - 1); 1790 1791 /* XXX Currently, we allocate one big contiguous table for the 1792 * TCEs. 
We only really need one chunk per 256M of TCE space 1793 * (ie per segment) but that's an optimization for later, it 1794 * requires some added smarts with our get/put_tce implementation 1795 */ 1796 tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, 1797 get_order(TCE32_TABLE_SIZE * segs)); 1798 if (!tce_mem) { 1799 pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); 1800 goto fail; 1801 } 1802 addr = page_address(tce_mem); 1803 memset(addr, 0, TCE32_TABLE_SIZE * segs); 1804 1805 /* Configure HW */ 1806 for (i = 0; i < segs; i++) { 1807 rc = opal_pci_map_pe_dma_window(phb->opal_id, 1808 pe->pe_number, 1809 base + i, 1, 1810 __pa(addr) + TCE32_TABLE_SIZE * i, 1811 TCE32_TABLE_SIZE, 0x1000); 1812 if (rc) { 1813 pe_err(pe, " Failed to configure 32-bit TCE table," 1814 " err %ld\n", rc); 1815 goto fail; 1816 } 1817 } 1818 1819 /* Setup linux iommu table */ 1820 tbl = pe->tce32_table; 1821 pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, 1822 base << 28, IOMMU_PAGE_SHIFT_4K); 1823 1824 /* OPAL variant of P7IOC SW invalidated TCEs */ 1825 swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); 1826 if (swinvp) { 1827 /* We need a couple more fields -- an address and a data 1828 * to or. Since the bus is only printed out on table free 1829 * errors, and on the first pass the data will be a relative 1830 * bus number, print that out instead. 1831 */ 1832 pe->tce_inval_reg_phys = be64_to_cpup(swinvp); 1833 tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, 1834 8); 1835 tbl->it_type |= (TCE_PCI_SWINV_CREATE | 1836 TCE_PCI_SWINV_FREE | 1837 TCE_PCI_SWINV_PAIR); 1838 } 1839 iommu_init_table(tbl, phb->hose->node); 1840 1841 if (pe->flags & PNV_IODA_PE_DEV) { 1842 iommu_register_group(tbl, phb->hose->global_number, 1843 pe->pe_number); 1844 set_iommu_table_base_and_group(&pe->pdev->dev, tbl); 1845 } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { 1846 iommu_register_group(tbl, phb->hose->global_number, 1847 pe->pe_number); 1848 pnv_ioda_setup_bus_dma(pe, pe->pbus, true); 1849 } else if (pe->flags & PNV_IODA_PE_VF) { 1850 iommu_register_group(tbl, phb->hose->global_number, 1851 pe->pe_number); 1852 } 1853 1854 return; 1855 fail: 1856 /* XXX Failure: Try to fallback to 64-bit only ? */ 1857 if (pe->tce32_seg >= 0) 1858 pe->tce32_seg = -1; 1859 if (tce_mem) 1860 __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); 1861} 1862 1863static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) 1864{ 1865 struct pnv_ioda_pe *pe = tbl->data; 1866 uint16_t window_id = (pe->pe_number << 1 ) + 1; 1867 int64_t rc; 1868 1869 pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis"); 1870 if (enable) { 1871 phys_addr_t top = memblock_end_of_DRAM(); 1872 1873 top = roundup_pow_of_two(top); 1874 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, 1875 pe->pe_number, 1876 window_id, 1877 pe->tce_bypass_base, 1878 top); 1879 } else { 1880 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, 1881 pe->pe_number, 1882 window_id, 1883 pe->tce_bypass_base, 1884 0); 1885 1886 /* 1887 * EEH needs the mapping between IOMMU table and group 1888 * of those VFIO/KVM pass-through devices. We can postpone 1889 * resetting DMA ops until the DMA mask is configured in 1890 * host side. 
1891 */ 1892 if (pe->pdev) 1893 set_iommu_table_base(&pe->pdev->dev, tbl); 1894 else 1895 pnv_ioda_setup_bus_dma(pe, pe->pbus, false); 1896 } 1897 if (rc) 1898 pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); 1899 else 1900 pe->tce_bypass_enabled = enable; 1901} 1902 1903static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, 1904 struct pnv_ioda_pe *pe) 1905{ 1906 /* TVE #1 is selected by PCI address bit 59 */ 1907 pe->tce_bypass_base = 1ull << 59; 1908 1909 /* Install set_bypass callback for VFIO */ 1910 pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass; 1911 1912 /* Enable bypass by default */ 1913 pnv_pci_ioda2_set_bypass(pe->tce32_table, true); 1914} 1915 1916static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, 1917 struct pnv_ioda_pe *pe) 1918{ 1919 struct page *tce_mem = NULL; 1920 void *addr; 1921 const __be64 *swinvp; 1922 struct iommu_table *tbl; 1923 unsigned int tce_table_size, end; 1924 int64_t rc; 1925 1926 /* We shouldn't already have a 32-bit DMA associated */ 1927 if (WARN_ON(pe->tce32_seg >= 0)) 1928 return; 1929 1930 /* The PE will reserve all possible 32-bits space */ 1931 pe->tce32_seg = 0; 1932 end = (1 << ilog2(phb->ioda.m32_pci_base)); 1933 tce_table_size = (end / 0x1000) * 8; 1934 pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", 1935 end); 1936 1937 /* Allocate TCE table */ 1938 tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, 1939 get_order(tce_table_size)); 1940 if (!tce_mem) { 1941 pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); 1942 goto fail; 1943 } 1944 addr = page_address(tce_mem); 1945 memset(addr, 0, tce_table_size); 1946 1947 /* 1948 * Map TCE table through TVT. The TVE index is the PE number 1949 * shifted by 1 bit for 32-bits DMA space. 1950 */ 1951 rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, 1952 pe->pe_number << 1, 1, __pa(addr), 1953 tce_table_size, 0x1000); 1954 if (rc) { 1955 pe_err(pe, "Failed to configure 32-bit TCE table," 1956 " err %ld\n", rc); 1957 goto fail; 1958 } 1959 1960 /* Setup linux iommu table */ 1961 tbl = pe->tce32_table; 1962 pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, 1963 IOMMU_PAGE_SHIFT_4K); 1964 1965 /* OPAL variant of PHB3 invalidated TCEs */ 1966 swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); 1967 if (swinvp) { 1968 /* We need a couple more fields -- an address and a data 1969 * to or. Since the bus is only printed out on table free 1970 * errors, and on the first pass the data will be a relative 1971 * bus number, print that out instead. 
		 */
		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
				8);
		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
	}
	iommu_init_table(tbl, phb->hose->node);

	if (pe->flags & PNV_IODA_PE_DEV) {
		iommu_register_group(tbl, phb->hose->global_number,
				     pe->pe_number);
		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
		iommu_register_group(tbl, phb->hose->global_number,
				     pe->pe_number);
		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
	} else if (pe->flags & PNV_IODA_PE_VF) {
		iommu_register_group(tbl, phb->hose->global_number,
				     pe->pe_number);
	}

	/* Also create a bypass window */
	if (!pnv_iommu_bypass_disabled)
		pnv_pci_ioda2_setup_bypass_pe(phb, pe);

	return;
fail:
	if (pe->tce32_seg >= 0)
		pe->tce32_seg = -1;
	if (tce_mem)
		__free_pages(tce_mem, get_order(tce_table_size));
}

static void pnv_ioda_setup_dma(struct pnv_phb *phb)
{
	struct pci_controller *hose = phb->hose;
	unsigned int residual, remaining, segs, tw, base;
	struct pnv_ioda_pe *pe;

	/* If we have more PE# than segments available, hand out one
	 * per PE until we run out and let the rest fail. If not,
	 * then we assign at least one segment per PE, plus more based
	 * on the amount of devices under that PE
	 */
	if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
		residual = 0;
	else
		residual = phb->ioda.tce32_count -
			   phb->ioda.dma_pe_count;

	pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
		hose->global_number, phb->ioda.tce32_count);
	pr_info("PCI: %d PE# for a total weight of %d\n",
		phb->ioda.dma_pe_count, phb->ioda.dma_weight);

	/* Walk our PE list and configure their DMA segments, hand them
	 * out one base segment plus any residual segments based on
	 * weight
	 */
	remaining = phb->ioda.tce32_count;
	tw = phb->ioda.dma_weight;
	base = 0;
	list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
		if (!pe->dma_weight)
			continue;
		if (!remaining) {
			pe_warn(pe, "No DMA32 resources available\n");
			continue;
		}
		segs = 1;
		if (residual) {
			segs += ((pe->dma_weight * residual) + (tw / 2)) / tw;
			if (segs > remaining)
				segs = remaining;
		}

		/*
		 * For an IODA2 compliant PHB3, we needn't care about the
		 * weight. All of the available 32-bit DMA space will be
		 * assigned to the specific PE.
2052 */ 2053 if (phb->type == PNV_PHB_IODA1) { 2054 pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", 2055 pe->dma_weight, segs); 2056 pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); 2057 } else { 2058 pe_info(pe, "Assign DMA32 space\n"); 2059 segs = 0; 2060 pnv_pci_ioda2_setup_dma_pe(phb, pe); 2061 } 2062 2063 remaining -= segs; 2064 base += segs; 2065 } 2066} 2067 2068#ifdef CONFIG_PCI_MSI 2069static void pnv_ioda2_msi_eoi(struct irq_data *d) 2070{ 2071 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 2072 struct irq_chip *chip = irq_data_get_irq_chip(d); 2073 struct pnv_phb *phb = container_of(chip, struct pnv_phb, 2074 ioda.irq_chip); 2075 int64_t rc; 2076 2077 rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); 2078 WARN_ON_ONCE(rc); 2079 2080 icp_native_eoi(d); 2081} 2082 2083 2084static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) 2085{ 2086 struct irq_data *idata; 2087 struct irq_chip *ichip; 2088 2089 if (phb->type != PNV_PHB_IODA2) 2090 return; 2091 2092 if (!phb->ioda.irq_chip_init) { 2093 /* 2094 * First time we setup an MSI IRQ, we need to setup the 2095 * corresponding IRQ chip to route correctly. 2096 */ 2097 idata = irq_get_irq_data(virq); 2098 ichip = irq_data_get_irq_chip(idata); 2099 phb->ioda.irq_chip_init = 1; 2100 phb->ioda.irq_chip = *ichip; 2101 phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; 2102 } 2103 irq_set_chip(virq, &phb->ioda.irq_chip); 2104} 2105 2106#ifdef CONFIG_CXL_BASE 2107 2108struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev) 2109{ 2110 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2111 2112 return of_node_get(hose->dn); 2113} 2114EXPORT_SYMBOL(pnv_pci_get_phb_node); 2115 2116int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode) 2117{ 2118 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2119 struct pnv_phb *phb = hose->private_data; 2120 struct pnv_ioda_pe *pe; 2121 int rc; 2122 2123 pe = pnv_ioda_get_pe(dev); 2124 if (!pe) 2125 return -ENODEV; 2126 2127 pe_info(pe, "Switching PHB to CXL\n"); 2128 2129 rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number); 2130 if (rc) 2131 dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc); 2132 2133 return rc; 2134} 2135EXPORT_SYMBOL(pnv_phb_to_cxl_mode); 2136 2137/* Find PHB for cxl dev and allocate MSI hwirqs? 
2138 * Returns the absolute hardware IRQ number 2139 */ 2140int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num) 2141{ 2142 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2143 struct pnv_phb *phb = hose->private_data; 2144 int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num); 2145 2146 if (hwirq < 0) { 2147 dev_warn(&dev->dev, "Failed to find a free MSI\n"); 2148 return -ENOSPC; 2149 } 2150 2151 return phb->msi_base + hwirq; 2152} 2153EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs); 2154 2155void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num) 2156{ 2157 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2158 struct pnv_phb *phb = hose->private_data; 2159 2160 msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num); 2161} 2162EXPORT_SYMBOL(pnv_cxl_release_hwirqs); 2163 2164void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs, 2165 struct pci_dev *dev) 2166{ 2167 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2168 struct pnv_phb *phb = hose->private_data; 2169 int i, hwirq; 2170 2171 for (i = 1; i < CXL_IRQ_RANGES; i++) { 2172 if (!irqs->range[i]) 2173 continue; 2174 pr_devel("cxl release irq range 0x%x: offset: 0x%lx limit: %ld\n", 2175 i, irqs->offset[i], 2176 irqs->range[i]); 2177 hwirq = irqs->offset[i] - phb->msi_base; 2178 msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 2179 irqs->range[i]); 2180 } 2181} 2182EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges); 2183 2184int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs, 2185 struct pci_dev *dev, int num) 2186{ 2187 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2188 struct pnv_phb *phb = hose->private_data; 2189 int i, hwirq, try; 2190 2191 memset(irqs, 0, sizeof(struct cxl_irq_ranges)); 2192 2193 /* 0 is reserved for the multiplexed PSL DSI interrupt */ 2194 for (i = 1; i < CXL_IRQ_RANGES && num; i++) { 2195 try = num; 2196 while (try) { 2197 hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try); 2198 if (hwirq >= 0) 2199 break; 2200 try /= 2; 2201 } 2202 if (!try) 2203 goto fail; 2204 2205 irqs->offset[i] = phb->msi_base + hwirq; 2206 irqs->range[i] = try; 2207 pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx limit: %li\n", 2208 i, irqs->offset[i], irqs->range[i]); 2209 num -= try; 2210 } 2211 if (num) 2212 goto fail; 2213 2214 return 0; 2215fail: 2216 pnv_cxl_release_hwirq_ranges(irqs, dev); 2217 return -ENOSPC; 2218} 2219EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges); 2220 2221int pnv_cxl_get_irq_count(struct pci_dev *dev) 2222{ 2223 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2224 struct pnv_phb *phb = hose->private_data; 2225 2226 return phb->msi_bmp.irq_count; 2227} 2228EXPORT_SYMBOL(pnv_cxl_get_irq_count); 2229 2230int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, 2231 unsigned int virq) 2232{ 2233 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2234 struct pnv_phb *phb = hose->private_data; 2235 unsigned int xive_num = hwirq - phb->msi_base; 2236 struct pnv_ioda_pe *pe; 2237 int rc; 2238 2239 if (!(pe = pnv_ioda_get_pe(dev))) 2240 return -ENODEV; 2241 2242 /* Assign XIVE to PE */ 2243 rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); 2244 if (rc) { 2245 pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x " 2246 "hwirq 0x%x XIVE 0x%x PE\n", 2247 pci_name(dev), rc, phb->msi_base, hwirq, xive_num); 2248 return -EIO; 2249 } 2250 set_msi_irq_chip(phb, virq); 2251 2252 return 0; 2253} 2254EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup); 2255#endif 2256 2257static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, 
2258 unsigned int hwirq, unsigned int virq, 2259 unsigned int is_64, struct msi_msg *msg) 2260{ 2261 struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); 2262 unsigned int xive_num = hwirq - phb->msi_base; 2263 __be32 data; 2264 int rc; 2265 2266 /* No PE assigned ? bail out ... no MSI for you ! */ 2267 if (pe == NULL) 2268 return -ENXIO; 2269 2270 /* Check if we have an MVE */ 2271 if (pe->mve_number < 0) 2272 return -ENXIO; 2273 2274 /* Force 32-bit MSI on some broken devices */ 2275 if (dev->no_64bit_msi) 2276 is_64 = 0; 2277 2278 /* Assign XIVE to PE */ 2279 rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); 2280 if (rc) { 2281 pr_warn("%s: OPAL error %d setting XIVE %d PE\n", 2282 pci_name(dev), rc, xive_num); 2283 return -EIO; 2284 } 2285 2286 if (is_64) { 2287 __be64 addr64; 2288 2289 rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, 2290 &addr64, &data); 2291 if (rc) { 2292 pr_warn("%s: OPAL error %d getting 64-bit MSI data\n", 2293 pci_name(dev), rc); 2294 return -EIO; 2295 } 2296 msg->address_hi = be64_to_cpu(addr64) >> 32; 2297 msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful; 2298 } else { 2299 __be32 addr32; 2300 2301 rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, 2302 &addr32, &data); 2303 if (rc) { 2304 pr_warn("%s: OPAL error %d getting 32-bit MSI data\n", 2305 pci_name(dev), rc); 2306 return -EIO; 2307 } 2308 msg->address_hi = 0; 2309 msg->address_lo = be32_to_cpu(addr32); 2310 } 2311 msg->data = be32_to_cpu(data); 2312 2313 set_msi_irq_chip(phb, virq); 2314 2315 pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," 2316 " address=%x_%08x data=%x PE# %d\n", 2317 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, 2318 msg->address_hi, msg->address_lo, data, pe->pe_number); 2319 2320 return 0; 2321} 2322 2323static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) 2324{ 2325 unsigned int count; 2326 const __be32 *prop = of_get_property(phb->hose->dn, 2327 "ibm,opal-msi-ranges", NULL); 2328 if (!prop) { 2329 /* BML Fallback */ 2330 prop = of_get_property(phb->hose->dn, "msi-ranges", NULL); 2331 } 2332 if (!prop) 2333 return; 2334 2335 phb->msi_base = be32_to_cpup(prop); 2336 count = be32_to_cpup(prop + 1); 2337 if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { 2338 pr_err("PCI %d: Failed to allocate MSI bitmap !\n", 2339 phb->hose->global_number); 2340 return; 2341 } 2342 2343 phb->msi_setup = pnv_pci_ioda_msi_setup; 2344 phb->msi32_support = 1; 2345 pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", 2346 count, phb->msi_base); 2347} 2348#else 2349static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } 2350#endif /* CONFIG_PCI_MSI */ 2351 2352#ifdef CONFIG_PCI_IOV 2353static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) 2354{ 2355 struct pci_controller *hose; 2356 struct pnv_phb *phb; 2357 struct resource *res; 2358 int i; 2359 resource_size_t size; 2360 struct pci_dn *pdn; 2361 int mul, total_vfs; 2362 2363 if (!pdev->is_physfn || pdev->is_added) 2364 return; 2365 2366 hose = pci_bus_to_host(pdev->bus); 2367 phb = hose->private_data; 2368 2369 pdn = pci_get_pdn(pdev); 2370 pdn->vfs_expanded = 0; 2371 2372 total_vfs = pci_sriov_get_totalvfs(pdev); 2373 pdn->m64_per_iov = 1; 2374 mul = phb->ioda.total_pe; 2375 2376 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { 2377 res = &pdev->resource[i + PCI_IOV_RESOURCES]; 2378 if (!res->flags || res->parent) 2379 continue; 2380 if (!pnv_pci_is_mem_pref_64(res->flags)) { 2381 dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n", 2382 i, res); 2383 continue; 2384 } 2385 
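 /* Note: if any VF BAR turns out to be larger than 64MB, the expansion factor used in the second loop below becomes roundup_pow_of_two(total_vfs) instead of the full PE count. */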
2386 size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); 2387 2388 /* bigger than 64M */ 2389 if (size > (1 << 26)) { 2390 dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", 2391 i, res); 2392 pdn->m64_per_iov = M64_PER_IOV; 2393 mul = roundup_pow_of_two(total_vfs); 2394 break; 2395 } 2396 } 2397 2398 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { 2399 res = &pdev->resource[i + PCI_IOV_RESOURCES]; 2400 if (!res->flags || res->parent) 2401 continue; 2402 if (!pnv_pci_is_mem_pref_64(res->flags)) { 2403 dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n", 2404 i, res); 2405 continue; 2406 } 2407 2408 dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); 2409 size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); 2410 res->end = res->start + size * mul - 1; 2411 dev_dbg(&pdev->dev, " %pR\n", res); 2412 dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)\n", 2413 i, res, mul); 2414 } 2415 pdn->vfs_expanded = mul; 2416} 2417#endif /* CONFIG_PCI_IOV */ 2418 2419/* 2420 * This function is supposed to be called on a per-PE basis, from top 2421 * to bottom. So the I/O or MMIO segment assigned to the 2422 * parent PE could be overridden by its child PEs if necessary. 2423 */ 2424static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, 2425 struct pnv_ioda_pe *pe) 2426{ 2427 struct pnv_phb *phb = hose->private_data; 2428 struct pci_bus_region region; 2429 struct resource *res; 2430 int i, index; 2431 int rc; 2432 2433 /* 2434 * NOTE: We only care about PCI-bus-based PEs for now. PCI-device-based 2435 * PEs, for example SR-IOV VFs, should be figured 2436 * out later. 2437 */ 2438 BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); 2439 2440 pci_bus_for_each_resource(pe->pbus, res, i) { 2441 if (!res || !res->flags || 2442 res->start > res->end) 2443 continue; 2444 2445 if (res->flags & IORESOURCE_IO) { 2446 region.start = res->start - phb->ioda.io_pci_base; 2447 region.end = res->end - phb->ioda.io_pci_base; 2448 index = region.start / phb->ioda.io_segsize; 2449 2450 while (index < phb->ioda.total_pe && 2451 region.start <= region.end) { 2452 phb->ioda.io_segmap[index] = pe->pe_number; 2453 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 2454 pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); 2455 if (rc != OPAL_SUCCESS) { 2456 pr_err("%s: OPAL error %d when mapping IO " 2457 "segment #%d to PE#%d\n", 2458 __func__, rc, index, pe->pe_number); 2459 break; 2460 } 2461 2462 region.start += phb->ioda.io_segsize; 2463 index++; 2464 } 2465 } else if ((res->flags & IORESOURCE_MEM) && 2466 !pnv_pci_is_mem_pref_64(res->flags)) { 2467 region.start = res->start - 2468 hose->mem_offset[0] - 2469 phb->ioda.m32_pci_base; 2470 region.end = res->end - 2471 hose->mem_offset[0] - 2472 phb->ioda.m32_pci_base; 2473 index = region.start / phb->ioda.m32_segsize; 2474 2475 while (index < phb->ioda.total_pe && 2476 region.start <= region.end) { 2477 phb->ioda.m32_segmap[index] = pe->pe_number; 2478 rc = opal_pci_map_pe_mmio_window(phb->opal_id, 2479 pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); 2480 if (rc != OPAL_SUCCESS) { 2481 pr_err("%s: OPAL error %d when mapping M32 " 2482 "segment#%d to PE#%d\n", 2483 __func__, rc, index, pe->pe_number); 2484 break; 2485 } 2486 2487 region.start += phb->ioda.m32_segsize; 2488 index++; 2489 } 2490 } 2491 } 2492} 2493 2494static void pnv_pci_ioda_setup_seg(void) 2495{ 2496 struct pci_controller *tmp, *hose; 2497 struct pnv_phb *phb; 2498 struct pnv_ioda_pe *pe; 2499 2500
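 /* Walk every PHB and program the I/O and M32 segment maps for each of its PEs. */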
list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 2501 phb = hose->private_data; 2502 list_for_each_entry(pe, &phb->ioda.pe_list, list) { 2503 pnv_ioda_setup_pe_seg(hose, pe); 2504 } 2505 } 2506} 2507 2508static void pnv_pci_ioda_setup_DMA(void) 2509{ 2510 struct pci_controller *hose, *tmp; 2511 struct pnv_phb *phb; 2512 2513 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 2514 pnv_ioda_setup_dma(hose->private_data); 2515 2516 /* Mark the PHB initialization done */ 2517 phb = hose->private_data; 2518 phb->initialized = 1; 2519 } 2520} 2521 2522static void pnv_pci_ioda_create_dbgfs(void) 2523{ 2524#ifdef CONFIG_DEBUG_FS 2525 struct pci_controller *hose, *tmp; 2526 struct pnv_phb *phb; 2527 char name[16]; 2528 2529 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 2530 phb = hose->private_data; 2531 2532 sprintf(name, "PCI%04x", hose->global_number); 2533 phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); 2534 if (!phb->dbgfs) 2535 pr_warning("%s: Error on creating debugfs on PHB#%x\n", 2536 __func__, hose->global_number); 2537 } 2538#endif /* CONFIG_DEBUG_FS */ 2539} 2540 2541static void pnv_pci_ioda_fixup(void) 2542{ 2543 pnv_pci_ioda_setup_PEs(); 2544 pnv_pci_ioda_setup_seg(); 2545 pnv_pci_ioda_setup_DMA(); 2546 2547 pnv_pci_ioda_create_dbgfs(); 2548 2549#ifdef CONFIG_EEH 2550 eeh_init(); 2551 eeh_addr_cache_build(); 2552#endif 2553} 2554 2555/* 2556 * Returns the alignment for I/O or memory windows for P2P 2557 * bridges. That actually depends on how PEs are segmented. 2558 * For now, we return I/O or M32 segment size for PE-sensitive 2559 * P2P bridges. Otherwise, the default values (4KiB for I/O, 2560 * 1MiB for memory) will be returned. 2561 * 2562 * The current PCI bus might be put into one PE, which was 2563 * created against the parent PCI bridge. In that case, we 2564 * needn't enlarge the alignment, which saves some 2565 * resources.
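 * (Hence the loop below returns an alignment of 1 as soon as it counts a second bridge of type PCI_EXP_TYPE_PCI_BRIDGE on the path towards the root bus.)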
2566 */ 2567static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, 2568 unsigned long type) 2569{ 2570 struct pci_dev *bridge; 2571 struct pci_controller *hose = pci_bus_to_host(bus); 2572 struct pnv_phb *phb = hose->private_data; 2573 int num_pci_bridges = 0; 2574 2575 bridge = bus->self; 2576 while (bridge) { 2577 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) { 2578 num_pci_bridges++; 2579 if (num_pci_bridges >= 2) 2580 return 1; 2581 } 2582 2583 bridge = bridge->bus->self; 2584 } 2585 2586 /* We fall back to M32 if M64 isn't supported */ 2587 if (phb->ioda.m64_segsize && 2588 pnv_pci_is_mem_pref_64(type)) 2589 return phb->ioda.m64_segsize; 2590 if (type & IORESOURCE_MEM) 2591 return phb->ioda.m32_segsize; 2592 2593 return phb->ioda.io_segsize; 2594} 2595 2596#ifdef CONFIG_PCI_IOV 2597static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, 2598 int resno) 2599{ 2600 struct pci_dn *pdn = pci_get_pdn(pdev); 2601 resource_size_t align, iov_align; 2602 2603 iov_align = resource_size(&pdev->resource[resno]); 2604 if (iov_align) 2605 return iov_align; 2606 2607 align = pci_iov_resource_size(pdev, resno); 2608 if (pdn->vfs_expanded) 2609 return pdn->vfs_expanded * align; 2610 2611 return align; 2612} 2613#endif /* CONFIG_PCI_IOV */ 2614 2615/* Prevent enabling devices for which we couldn't properly 2616 * assign a PE 2617 */ 2618static bool pnv_pci_enable_device_hook(struct pci_dev *dev) 2619{ 2620 struct pci_controller *hose = pci_bus_to_host(dev->bus); 2621 struct pnv_phb *phb = hose->private_data; 2622 struct pci_dn *pdn; 2623 2624 /* This function may be called while the PEs have 2625 * not been created yet, for example during resource 2626 * reassignment in the PCI probe period. We just skip 2627 * the check if the PEs aren't ready.
2628 */ 2629 if (!phb->initialized) 2630 return true; 2631 2632 pdn = pci_get_pdn(dev); 2633 if (!pdn || pdn->pe_number == IODA_INVALID_PE) 2634 return false; 2635 2636 return true; 2637} 2638 2639static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, 2640 u32 devfn) 2641{ 2642 return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; 2643} 2644 2645static void pnv_pci_ioda_shutdown(struct pnv_phb *phb) 2646{ 2647 opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE, 2648 OPAL_ASSERT_RESET); 2649} 2650 2651static void __init pnv_pci_init_ioda_phb(struct device_node *np, 2652 u64 hub_id, int ioda_type) 2653{ 2654 struct pci_controller *hose; 2655 struct pnv_phb *phb; 2656 unsigned long size, m32map_off, pemap_off, iomap_off = 0; 2657 const __be64 *prop64; 2658 const __be32 *prop32; 2659 int len; 2660 u64 phb_id; 2661 void *aux; 2662 long rc; 2663 2664 pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name); 2665 2666 prop64 = of_get_property(np, "ibm,opal-phbid", NULL); 2667 if (!prop64) { 2668 pr_err(" Missing \"ibm,opal-phbid\" property !\n"); 2669 return; 2670 } 2671 phb_id = be64_to_cpup(prop64); 2672 pr_debug(" PHB-ID : 0x%016llx\n", phb_id); 2673 2674 phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0); 2675 2676 /* Allocate PCI controller */ 2677 phb->hose = hose = pcibios_alloc_controller(np); 2678 if (!phb->hose) { 2679 pr_err(" Can't allocate PCI controller for %s\n", 2680 np->full_name); 2681 memblock_free(__pa(phb), sizeof(struct pnv_phb)); 2682 return; 2683 } 2684 2685 spin_lock_init(&phb->lock); 2686 prop32 = of_get_property(np, "bus-range", &len); 2687 if (prop32 && len == 8) { 2688 hose->first_busno = be32_to_cpu(prop32[0]); 2689 hose->last_busno = be32_to_cpu(prop32[1]); 2690 } else { 2691 pr_warn(" Broken <bus-range> on %s\n", np->full_name); 2692 hose->first_busno = 0; 2693 hose->last_busno = 0xff; 2694 } 2695 hose->private_data = phb; 2696 phb->hub_id = hub_id; 2697 phb->opal_id = phb_id; 2698 phb->type = ioda_type; 2699 mutex_init(&phb->ioda.pe_alloc_mutex); 2700 2701 /* Detect specific models for error handling */ 2702 if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) 2703 phb->model = PNV_PHB_MODEL_P7IOC; 2704 else if (of_device_is_compatible(np, "ibm,power8-pciex")) 2705 phb->model = PNV_PHB_MODEL_PHB3; 2706 else 2707 phb->model = PNV_PHB_MODEL_UNKNOWN; 2708 2709 /* Parse 32-bit and IO ranges (if any) */ 2710 pci_process_bridge_OF_ranges(hose, np, !hose->global_number); 2711 2712 /* Get registers */ 2713 phb->regs = of_iomap(np, 0); 2714 if (phb->regs == NULL) 2715 pr_err(" Failed to map registers !\n"); 2716 2717 /* Initialize more IODA stuff */ 2718 phb->ioda.total_pe = 1; 2719 prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); 2720 if (prop32) 2721 phb->ioda.total_pe = be32_to_cpup(prop32); 2722 prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL); 2723 if (prop32) 2724 phb->ioda.reserved_pe = be32_to_cpup(prop32); 2725 2726 /* Parse 64-bit MMIO range */ 2727 pnv_ioda_parse_m64_window(phb); 2728 2729 phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); 2730 /* FW has already taken off the top 64k of M32 space (MSI space) */ 2731 phb->ioda.m32_size += 0x10000; 2732 2733 phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; 2734 phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; 2735 phb->ioda.io_size = hose->pci_io_size; 2736 phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; 2737 phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ 2738 2739 /* Allocate aux data & arrays.
We don't have IO ports on PHB3 */ 2740 size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); 2741 m32map_off = size; 2742 size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); 2743 if (phb->type == PNV_PHB_IODA1) { 2744 iomap_off = size; 2745 size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); 2746 } 2747 pemap_off = size; 2748 size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); 2749 aux = memblock_virt_alloc(size, 0); 2750 phb->ioda.pe_alloc = aux; 2751 phb->ioda.m32_segmap = aux + m32map_off; 2752 if (phb->type == PNV_PHB_IODA1) 2753 phb->ioda.io_segmap = aux + iomap_off; 2754 phb->ioda.pe_array = aux + pemap_off; 2755 set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc); 2756 2757 INIT_LIST_HEAD(&phb->ioda.pe_dma_list); 2758 INIT_LIST_HEAD(&phb->ioda.pe_list); 2759 mutex_init(&phb->ioda.pe_list_mutex); 2760 2761 /* Calculate how many 32-bit TCE segments we have */ 2762 phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; 2763 2764#if 0 /* We should really do that ... */ 2765 rc = opal_pci_set_phb_mem_window(opal->phb_id, 2766 window_type, 2767 window_num, 2768 starting_real_address, 2769 starting_pci_address, 2770 segment_size); 2771#endif 2772 2773 pr_info(" %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n", 2774 phb->ioda.total_pe, phb->ioda.reserved_pe, 2775 phb->ioda.m32_size, phb->ioda.m32_segsize); 2776 if (phb->ioda.m64_size) 2777 pr_info(" M64: 0x%lx [segment=0x%lx]\n", 2778 phb->ioda.m64_size, phb->ioda.m64_segsize); 2779 if (phb->ioda.io_size) 2780 pr_info(" IO: 0x%x [segment=0x%x]\n", 2781 phb->ioda.io_size, phb->ioda.io_segsize); 2782 2783 2784 phb->hose->ops = &pnv_pci_ops; 2785 phb->get_pe_state = pnv_ioda_get_pe_state; 2786 phb->freeze_pe = pnv_ioda_freeze_pe; 2787 phb->unfreeze_pe = pnv_ioda_unfreeze_pe; 2788 2789 /* Setup RID -> PE mapping function */ 2790 phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; 2791 2792 /* Setup TCEs */ 2793 phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; 2794 phb->dma_set_mask = pnv_pci_ioda_dma_set_mask; 2795 phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask; 2796 2797 /* Setup shutdown function for kexec */ 2798 phb->shutdown = pnv_pci_ioda_shutdown; 2799 2800 /* Setup MSI support */ 2801 pnv_pci_init_ioda_msis(phb); 2802 2803 /* 2804 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here 2805 * to let the PCI core do resource assignment. It's expected 2806 * that the PCI core will do correct I/O and MMIO alignment 2807 * for the P2P bridge BARs so that each PCI bus (excluding 2808 * the child P2P bridges) can form an individual PE. 2809 */ 2810 ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; 2811 pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook; 2812 pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment; 2813 pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus; 2814 hose->controller_ops = pnv_pci_controller_ops; 2815 2816#ifdef CONFIG_PCI_IOV 2817 ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; 2818 ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; 2819#endif 2820 2821 pci_add_flags(PCI_REASSIGN_ALL_RSRC); 2822 2823 /* Reset IODA tables to a clean state */ 2824 rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET); 2825 if (rc) 2826 pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); 2827 2828 /* If we're running in the kdump kernel, the previous kernel never 2829 * shut down PCI devices correctly. We already got the IODA table 2830 * cleaned out.
So we have to issue a PHB reset to stop all PCI 2831 * transactions from the previous kernel. 2832 */ 2833 if (is_kdump_kernel()) { 2834 pr_info(" Issue PHB reset ...\n"); 2835 pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); 2836 pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE); 2837 } 2838 2839 /* Remove M64 resource if we can't configure it successfully */ 2840 if (!phb->init_m64 || phb->init_m64(phb)) 2841 hose->mem_resources[1].flags = 0; 2842} 2843 2844void __init pnv_pci_init_ioda2_phb(struct device_node *np) 2845{ 2846 pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); 2847} 2848 2849void __init pnv_pci_init_ioda_hub(struct device_node *np) 2850{ 2851 struct device_node *phbn; 2852 const __be64 *prop64; 2853 u64 hub_id; 2854 2855 pr_info("Probing IODA IO-Hub %s\n", np->full_name); 2856 2857 prop64 = of_get_property(np, "ibm,opal-hubid", NULL); 2858 if (!prop64) { 2859 pr_err(" Missing \"ibm,opal-hubid\" property !\n"); 2860 return; 2861 } 2862 hub_id = be64_to_cpup(prop64); 2863 pr_devel(" HUB-ID : 0x%016llx\n", hub_id); 2864 2865 /* Initialize child PHBs */ 2866 for_each_child_of_node(np, phbn) { 2867 /* Look for IODA1 PHBs */ 2868 if (of_device_is_compatible(phbn, "ibm,ioda-phb")) 2869 pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1); 2870 } 2871} 2872