1/*
2 * Support PCI/PCIe on PowerNV platforms
3 *
4 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#undef DEBUG
13
14#include <linux/kernel.h>
15#include <linux/pci.h>
16#include <linux/crash_dump.h>
17#include <linux/debugfs.h>
18#include <linux/delay.h>
19#include <linux/string.h>
20#include <linux/init.h>
21#include <linux/bootmem.h>
22#include <linux/irq.h>
23#include <linux/io.h>
24#include <linux/msi.h>
25#include <linux/memblock.h>
26
27#include <asm/sections.h>
28#include <asm/io.h>
29#include <asm/prom.h>
30#include <asm/pci-bridge.h>
31#include <asm/machdep.h>
32#include <asm/msi_bitmap.h>
33#include <asm/ppc-pci.h>
34#include <asm/opal.h>
35#include <asm/iommu.h>
36#include <asm/tce.h>
37#include <asm/xics.h>
38#include <asm/debug.h>
39#include <asm/firmware.h>
40#include <asm/pnv-pci.h>
41
42#include <misc/cxl.h>
43
44#include "powernv.h"
45#include "pci.h"
46
47/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
48#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
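/* That works out to 0x10000000 / 0x1000 = 65536 TCEs of 8 bytes each,
 * i.e. a 512KB table per 256MB DMA segment.
 */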
49
50static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
51			    const char *fmt, ...)
52{
53	struct va_format vaf;
54	va_list args;
55	char pfix[32];
56
57	va_start(args, fmt);
58
59	vaf.fmt = fmt;
60	vaf.va = &args;
61
62	if (pe->flags & PNV_IODA_PE_DEV)
63		strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
64	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
65		sprintf(pfix, "%04x:%02x     ",
66			pci_domain_nr(pe->pbus), pe->pbus->number);
67#ifdef CONFIG_PCI_IOV
68	else if (pe->flags & PNV_IODA_PE_VF)
69		sprintf(pfix, "%04x:%02x:%2x.%d",
70			pci_domain_nr(pe->parent_dev->bus),
71			(pe->rid & 0xff00) >> 8,
72			PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
73#endif /* CONFIG_PCI_IOV*/
74
75	printk("%spci %s: [PE# %.3d] %pV",
76	       level, pfix, pe->pe_number, &vaf);
77
78	va_end(args);
79}
80
81#define pe_err(pe, fmt, ...)					\
82	pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
83#define pe_warn(pe, fmt, ...)					\
84	pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
85#define pe_info(pe, fmt, ...)					\
86	pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
87
88static bool pnv_iommu_bypass_disabled __read_mostly;
89
90static int __init iommu_setup(char *str)
91{
92	if (!str)
93		return -EINVAL;
94
95	while (*str) {
96		if (!strncmp(str, "nobypass", 8)) {
97			pnv_iommu_bypass_disabled = true;
98			pr_info("PowerNV: IOMMU bypass window disabled.\n");
99			break;
100		}
101		str += strcspn(str, ",");
102		if (*str == ',')
103			str++;
104	}
105
106	return 0;
107}
108early_param("iommu", iommu_setup);
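
/*
 * For example, booting with "iommu=nobypass" on the kernel command line
 * keeps every PE behind its 32-bit TCE window instead of also setting up
 * the 64-bit direct-bypass window on IODA2 PHBs.
 */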
109
110/*
111 * stdcix is only supposed to be used in hypervisor real mode as per
112 * the architecture spec
113 */
114static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
115{
116	__asm__ __volatile__("stdcix %0,0,%1"
117		: : "r" (val), "r" (paddr) : "memory");
118}
119
120static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
121{
122	return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==
123		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
124}
125
126static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
127{
128	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
129		pr_warn("%s: Invalid PE %d on PHB#%x\n",
130			__func__, pe_no, phb->hose->global_number);
131		return;
132	}
133
134	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
		pr_warn("%s: PE %d was already assigned on PHB#%x\n",
			__func__, pe_no, phb->hose->global_number);
137		return;
138	}
139
140	phb->ioda.pe_array[pe_no].phb = phb;
141	phb->ioda.pe_array[pe_no].pe_number = pe_no;
142}
143
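/*
 * Allocate the next free PE number on this PHB, marking it in the
 * allocation bitmap, or return IODA_INVALID_PE if none is left.
 */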
144static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
145{
146	unsigned long pe;
147
148	do {
149		pe = find_next_zero_bit(phb->ioda.pe_alloc,
150					phb->ioda.total_pe, 0);
151		if (pe >= phb->ioda.total_pe)
152			return IODA_INVALID_PE;
153	} while(test_and_set_bit(pe, phb->ioda.pe_alloc));
154
155	phb->ioda.pe_array[pe].phb = phb;
156	phb->ioda.pe_array[pe].pe_number = pe;
157	return pe;
158}
159
160static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
161{
162	WARN_ON(phb->ioda.pe_array[pe].pdev);
163
164	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
165	clear_bit(pe, phb->ioda.pe_alloc);
166}
167
168/* The default M64 BAR is shared by all PEs */
169static int pnv_ioda2_init_m64(struct pnv_phb *phb)
170{
171	const char *desc;
172	struct resource *r;
173	s64 rc;
174
175	/* Configure the default M64 BAR */
176	rc = opal_pci_set_phb_mem_window(phb->opal_id,
177					 OPAL_M64_WINDOW_TYPE,
178					 phb->ioda.m64_bar_idx,
179					 phb->ioda.m64_base,
180					 0, /* unused */
181					 phb->ioda.m64_size);
182	if (rc != OPAL_SUCCESS) {
183		desc = "configuring";
184		goto fail;
185	}
186
187	/* Enable the default M64 BAR */
188	rc = opal_pci_phb_mmio_enable(phb->opal_id,
189				      OPAL_M64_WINDOW_TYPE,
190				      phb->ioda.m64_bar_idx,
191				      OPAL_ENABLE_M64_SPLIT);
192	if (rc != OPAL_SUCCESS) {
193		desc = "enabling";
194		goto fail;
195	}
196
197	/* Mark the M64 BAR assigned */
198	set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc);
199
	/*
	 * Strip off the segment used by the reserved PE, which is
	 * expected to be either 0 or the last PE number.
	 */
204	r = &phb->hose->mem_resources[1];
205	if (phb->ioda.reserved_pe == 0)
206		r->start += phb->ioda.m64_segsize;
207	else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1))
208		r->end -= phb->ioda.m64_segsize;
209	else
210		pr_warn("  Cannot strip M64 segment for reserved PE#%d\n",
211			phb->ioda.reserved_pe);
212
213	return 0;
214
215fail:
216	pr_warn("  Failure %lld %s M64 BAR#%d\n",
217		rc, desc, phb->ioda.m64_bar_idx);
218	opal_pci_phb_mmio_enable(phb->opal_id,
219				 OPAL_M64_WINDOW_TYPE,
220				 phb->ioda.m64_bar_idx,
221				 OPAL_DISABLE_M64);
222	return -EIO;
223}
224
225static void pnv_ioda2_reserve_m64_pe(struct pnv_phb *phb)
226{
227	resource_size_t sgsz = phb->ioda.m64_segsize;
228	struct pci_dev *pdev;
229	struct resource *r;
230	int base, step, i;
231
	/*
	 * The root bus always claims the full M64 range, while the
	 * root ports reflect the M64 ranges actually in use. So we
	 * check the root ports instead of the root bus.
	 */
237	list_for_each_entry(pdev, &phb->hose->bus->devices, bus_list) {
238		for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
239			r = &pdev->resource[PCI_BRIDGE_RESOURCES + i];
240			if (!r->parent ||
241			    !pnv_pci_is_mem_pref_64(r->flags))
242				continue;
243
244			base = (r->start - phb->ioda.m64_base) / sgsz;
245			for (step = 0; step < resource_size(r) / sgsz; step++)
246				pnv_ioda_reserve_pe(phb, base + step);
247		}
248	}
249}
250
251static int pnv_ioda2_pick_m64_pe(struct pnv_phb *phb,
252				 struct pci_bus *bus, int all)
253{
254	resource_size_t segsz = phb->ioda.m64_segsize;
255	struct pci_dev *pdev;
256	struct resource *r;
257	struct pnv_ioda_pe *master_pe, *pe;
258	unsigned long size, *pe_alloc;
259	bool found;
260	int start, i, j;
261
262	/* Root bus shouldn't use M64 */
263	if (pci_is_root_bus(bus))
264		return IODA_INVALID_PE;
265
266	/* We support only one M64 window on each bus */
267	found = false;
268	pci_bus_for_each_resource(bus, r, i) {
269		if (r && r->parent &&
270		    pnv_pci_is_mem_pref_64(r->flags)) {
271			found = true;
272			break;
273		}
274	}
275
276	/* No M64 window found ? */
277	if (!found)
278		return IODA_INVALID_PE;
279
280	/* Allocate bitmap */
281	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
282	pe_alloc = kzalloc(size, GFP_KERNEL);
283	if (!pe_alloc) {
284		pr_warn("%s: Out of memory !\n",
285			__func__);
286		return IODA_INVALID_PE;
287	}
288
	/*
	 * Figure out the PE numbers reserved for the PE and
	 * its child PEs.
	 */
293	start = (r->start - phb->ioda.m64_base) / segsz;
294	for (i = 0; i < resource_size(r) / segsz; i++)
295		set_bit(start + i, pe_alloc);
296
297	if (all)
298		goto done;
299
	/*
	 * If the PE doesn't cover all subordinate buses, we need
	 * to subtract the PEs reserved for its children.
	 */
304	list_for_each_entry(pdev, &bus->devices, bus_list) {
305		if (!pdev->subordinate)
306			continue;
307
308		pci_bus_for_each_resource(pdev->subordinate, r, i) {
309			if (!r || !r->parent ||
310			    !pnv_pci_is_mem_pref_64(r->flags))
311				continue;
312
313			start = (r->start - phb->ioda.m64_base) / segsz;
314			for (j = 0; j < resource_size(r) / segsz ; j++)
315				clear_bit(start + j, pe_alloc);
		}
	}
318
	/*
	 * The current bus might not own an M64 window; it may all be
	 * contributed by its child buses. In that case, we don't need
	 * to pick an M64-dependent PE#.
	 */
324	if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) {
325		kfree(pe_alloc);
326		return IODA_INVALID_PE;
327	}
328
329	/*
330	 * Figure out the master PE and put all slave PEs to master
331	 * PE's list to form compound PE.
332	 */
333done:
334	master_pe = NULL;
335	i = -1;
336	while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) <
337		phb->ioda.total_pe) {
338		pe = &phb->ioda.pe_array[i];
339
340		if (!master_pe) {
341			pe->flags |= PNV_IODA_PE_MASTER;
342			INIT_LIST_HEAD(&pe->slaves);
343			master_pe = pe;
344		} else {
345			pe->flags |= PNV_IODA_PE_SLAVE;
346			pe->master = master_pe;
347			list_add_tail(&pe->list, &master_pe->slaves);
348		}
349	}
350
351	kfree(pe_alloc);
352	return master_pe->pe_number;
353}
354
355static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
356{
357	struct pci_controller *hose = phb->hose;
358	struct device_node *dn = hose->dn;
359	struct resource *res;
360	const u32 *r;
361	u64 pci_addr;
362
363	/* FIXME: Support M64 for P7IOC */
364	if (phb->type != PNV_PHB_IODA2) {
		pr_info("  M64 window is not supported\n");
366		return;
367	}
368
369	if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
370		pr_info("  Firmware too old to support M64 window\n");
371		return;
372	}
373
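	/*
	 * The "ibm,opal-m64-window" property is read here as three 2-cell
	 * values: the window's PCI address, its parent (CPU) address and
	 * its size.
	 */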
374	r = of_get_property(dn, "ibm,opal-m64-window", NULL);
375	if (!r) {
376		pr_info("  No <ibm,opal-m64-window> on %s\n",
377			dn->full_name);
378		return;
379	}
380
381	res = &hose->mem_resources[1];
382	res->start = of_translate_address(dn, r + 2);
383	res->end = res->start + of_read_number(r + 4, 2) - 1;
384	res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
385	pci_addr = of_read_number(r, 2);
386	hose->mem_offset[1] = res->start - pci_addr;
387
388	phb->ioda.m64_size = resource_size(res);
389	phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe;
390	phb->ioda.m64_base = pci_addr;
391
392	pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n",
393			res->start, res->end, pci_addr);
394
395	/* Use last M64 BAR to cover M64 window */
396	phb->ioda.m64_bar_idx = 15;
397	phb->init_m64 = pnv_ioda2_init_m64;
398	phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe;
399	phb->pick_m64_pe = pnv_ioda2_pick_m64_pe;
400}
401
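/*
 * Freeze a PE in OPAL. For a compound PE the freeze is applied to the
 * master PE first and then to each of its slaves.
 */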
402static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
403{
404	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
405	struct pnv_ioda_pe *slave;
406	s64 rc;
407
408	/* Fetch master PE */
409	if (pe->flags & PNV_IODA_PE_SLAVE) {
410		pe = pe->master;
411		if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
412			return;
413
414		pe_no = pe->pe_number;
415	}
416
417	/* Freeze master PE */
418	rc = opal_pci_eeh_freeze_set(phb->opal_id,
419				     pe_no,
420				     OPAL_EEH_ACTION_SET_FREEZE_ALL);
421	if (rc != OPAL_SUCCESS) {
422		pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
423			__func__, rc, phb->hose->global_number, pe_no);
424		return;
425	}
426
427	/* Freeze slave PEs */
428	if (!(pe->flags & PNV_IODA_PE_MASTER))
429		return;
430
431	list_for_each_entry(slave, &pe->slaves, list) {
432		rc = opal_pci_eeh_freeze_set(phb->opal_id,
433					     slave->pe_number,
434					     OPAL_EEH_ACTION_SET_FREEZE_ALL);
435		if (rc != OPAL_SUCCESS)
436			pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
437				__func__, rc, phb->hose->global_number,
438				slave->pe_number);
439	}
440}
441
442static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
443{
444	struct pnv_ioda_pe *pe, *slave;
445	s64 rc;
446
447	/* Find master PE */
448	pe = &phb->ioda.pe_array[pe_no];
449	if (pe->flags & PNV_IODA_PE_SLAVE) {
450		pe = pe->master;
451		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
452		pe_no = pe->pe_number;
453	}
454
455	/* Clear frozen state for master PE */
456	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
457	if (rc != OPAL_SUCCESS) {
458		pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
459			__func__, rc, opt, phb->hose->global_number, pe_no);
460		return -EIO;
461	}
462
463	if (!(pe->flags & PNV_IODA_PE_MASTER))
464		return 0;
465
466	/* Clear frozen state for slave PEs */
467	list_for_each_entry(slave, &pe->slaves, list) {
468		rc = opal_pci_eeh_freeze_clear(phb->opal_id,
469					     slave->pe_number,
470					     opt);
471		if (rc != OPAL_SUCCESS) {
472			pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
473				__func__, rc, opt, phb->hose->global_number,
474				slave->pe_number);
475			return -EIO;
476		}
477	}
478
479	return 0;
480}
481
482static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
483{
484	struct pnv_ioda_pe *slave, *pe;
485	u8 fstate, state;
486	__be16 pcierr;
487	s64 rc;
488
489	/* Sanity check on PE number */
490	if (pe_no < 0 || pe_no >= phb->ioda.total_pe)
491		return OPAL_EEH_STOPPED_PERM_UNAVAIL;
492
	/*
	 * Fetch the master PE; note the PE instance might not
	 * have been initialized yet.
	 */
497	pe = &phb->ioda.pe_array[pe_no];
498	if (pe->flags & PNV_IODA_PE_SLAVE) {
499		pe = pe->master;
500		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
501		pe_no = pe->pe_number;
502	}
503
504	/* Check the master PE */
505	rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
506					&state, &pcierr, NULL);
507	if (rc != OPAL_SUCCESS) {
508		pr_warn("%s: Failure %lld getting "
509			"PHB#%x-PE#%x state\n",
510			__func__, rc,
511			phb->hose->global_number, pe_no);
512		return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
513	}
514
515	/* Check the slave PE */
516	if (!(pe->flags & PNV_IODA_PE_MASTER))
517		return state;
518
519	list_for_each_entry(slave, &pe->slaves, list) {
520		rc = opal_pci_eeh_freeze_status(phb->opal_id,
521						slave->pe_number,
522						&fstate,
523						&pcierr,
524						NULL);
525		if (rc != OPAL_SUCCESS) {
526			pr_warn("%s: Failure %lld getting "
527				"PHB#%x-PE#%x state\n",
528				__func__, rc,
529				phb->hose->global_number, slave->pe_number);
530			return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
531		}
532
		/*
		 * Override the result with the higher-priority
		 * (more severe) state.
		 */
537		if (fstate > state)
538			state = fstate;
539	}
540
541	return state;
542}
543
544/* Currently those 2 are only used when MSIs are enabled, this will change
545 * but in the meantime, we need to protect them to avoid warnings
546 */
547#ifdef CONFIG_PCI_MSI
548static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
549{
550	struct pci_controller *hose = pci_bus_to_host(dev->bus);
551	struct pnv_phb *phb = hose->private_data;
552	struct pci_dn *pdn = pci_get_pdn(dev);
553
554	if (!pdn)
555		return NULL;
556	if (pdn->pe_number == IODA_INVALID_PE)
557		return NULL;
558	return &phb->ioda.pe_array[pdn->pe_number];
559}
560#endif /* CONFIG_PCI_MSI */
561
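
/*
 * Add or remove a single parent/child entry in the PELT-V. If the child
 * is a master (compound) PE, the same update is applied to its slaves.
 */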
562static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
563				  struct pnv_ioda_pe *parent,
564				  struct pnv_ioda_pe *child,
565				  bool is_add)
566{
567	const char *desc = is_add ? "adding" : "removing";
568	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
569			      OPAL_REMOVE_PE_FROM_DOMAIN;
570	struct pnv_ioda_pe *slave;
571	long rc;
572
573	/* Parent PE affects child PE */
574	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
575				child->pe_number, op);
576	if (rc != OPAL_SUCCESS) {
577		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
578			rc, desc);
579		return -ENXIO;
580	}
581
582	if (!(child->flags & PNV_IODA_PE_MASTER))
583		return 0;
584
585	/* Compound case: parent PE affects slave PEs */
586	list_for_each_entry(slave, &child->slaves, list) {
587		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
588					slave->pe_number, op);
589		if (rc != OPAL_SUCCESS) {
590			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
591				rc, desc);
592			return -ENXIO;
593		}
594	}
595
596	return 0;
597}
598
599static int pnv_ioda_set_peltv(struct pnv_phb *phb,
600			      struct pnv_ioda_pe *pe,
601			      bool is_add)
602{
603	struct pnv_ioda_pe *slave;
604	struct pci_dev *pdev = NULL;
605	int ret;
606
	/*
	 * Clear the PE frozen state. If it's a master PE, we need
	 * to clear the slave PEs' frozen state as well.
	 */
611	if (is_add) {
612		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
613					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
614		if (pe->flags & PNV_IODA_PE_MASTER) {
615			list_for_each_entry(slave, &pe->slaves, list)
616				opal_pci_eeh_freeze_clear(phb->opal_id,
617							  slave->pe_number,
618							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
619		}
620	}
621
	/*
	 * Associate the PE in the PELT. We also need to add the PE
	 * to the corresponding PELT-V. Otherwise, an error
	 * originating from the PE might spread to other PEs.
	 */
628	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
629	if (ret)
630		return ret;
631
632	/* For compound PEs, any one affects all of them */
633	if (pe->flags & PNV_IODA_PE_MASTER) {
634		list_for_each_entry(slave, &pe->slaves, list) {
635			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
636			if (ret)
637				return ret;
638		}
639	}
640
641	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
642		pdev = pe->pbus->self;
643	else if (pe->flags & PNV_IODA_PE_DEV)
644		pdev = pe->pdev->bus->self;
645#ifdef CONFIG_PCI_IOV
646	else if (pe->flags & PNV_IODA_PE_VF)
647		pdev = pe->parent_dev->bus->self;
648#endif /* CONFIG_PCI_IOV */
649	while (pdev) {
650		struct pci_dn *pdn = pci_get_pdn(pdev);
651		struct pnv_ioda_pe *parent;
652
653		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
654			parent = &phb->ioda.pe_array[pdn->pe_number];
655			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
656			if (ret)
657				return ret;
658		}
659
660		pdev = pdev->bus->self;
661	}
662
663	return 0;
664}
665
666#ifdef CONFIG_PCI_IOV
667static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
668{
669	struct pci_dev *parent;
670	uint8_t bcomp, dcomp, fcomp;
671	int64_t rc;
672	long rid_end, rid;
673
	/* Currently, we only deconfigure VF PEs. Bus PEs are always present. */
675	if (pe->pbus) {
676		int count;
677
678		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
679		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
680		parent = pe->pbus->self;
681		if (pe->flags & PNV_IODA_PE_BUS_ALL)
682			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
683		else
684			count = 1;
685
686		switch(count) {
687		case  1: bcomp = OpalPciBusAll;         break;
688		case  2: bcomp = OpalPciBus7Bits;       break;
689		case  4: bcomp = OpalPciBus6Bits;       break;
690		case  8: bcomp = OpalPciBus5Bits;       break;
691		case 16: bcomp = OpalPciBus4Bits;       break;
692		case 32: bcomp = OpalPciBus3Bits;       break;
693		default:
694			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
695			        count);
696			/* Do an exact match only */
697			bcomp = OpalPciBusAll;
698		}
699		rid_end = pe->rid + (count << 8);
700	} else {
701		if (pe->flags & PNV_IODA_PE_VF)
702			parent = pe->parent_dev;
703		else
704			parent = pe->pdev->bus->self;
705		bcomp = OpalPciBusAll;
706		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
707		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
708		rid_end = pe->rid + 1;
709	}
710
711	/* Clear the reverse map */
712	for (rid = pe->rid; rid < rid_end; rid++)
713		phb->ioda.pe_rmap[rid] = 0;
714
715	/* Release from all parents PELT-V */
716	while (parent) {
717		struct pci_dn *pdn = pci_get_pdn(parent);
718		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
719			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
720						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
721			/* XXX What to do in case of error ? */
722		}
723		parent = parent->bus->self;
724	}
725
726	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
727				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
728
729	/* Disassociate PE in PELT */
730	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
731				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
732	if (rc)
733		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
734	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
735			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
736	if (rc)
737		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
738
739	pe->pbus = NULL;
740	pe->pdev = NULL;
741	pe->parent_dev = NULL;
742
743	return 0;
744}
745#endif /* CONFIG_PCI_IOV */
746
747static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
748{
749	struct pci_dev *parent;
750	uint8_t bcomp, dcomp, fcomp;
751	long rc, rid_end, rid;
752
753	/* Bus validation ? */
754	if (pe->pbus) {
755		int count;
756
757		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
758		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
759		parent = pe->pbus->self;
760		if (pe->flags & PNV_IODA_PE_BUS_ALL)
761			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
762		else
763			count = 1;
764
765		switch(count) {
766		case  1: bcomp = OpalPciBusAll;		break;
767		case  2: bcomp = OpalPciBus7Bits;	break;
768		case  4: bcomp = OpalPciBus6Bits;	break;
769		case  8: bcomp = OpalPciBus5Bits;	break;
770		case 16: bcomp = OpalPciBus4Bits;	break;
771		case 32: bcomp = OpalPciBus3Bits;	break;
772		default:
773			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
774			        count);
775			/* Do an exact match only */
776			bcomp = OpalPciBusAll;
777		}
778		rid_end = pe->rid + (count << 8);
779	} else {
780#ifdef CONFIG_PCI_IOV
781		if (pe->flags & PNV_IODA_PE_VF)
782			parent = pe->parent_dev;
783		else
784#endif /* CONFIG_PCI_IOV */
785			parent = pe->pdev->bus->self;
786		bcomp = OpalPciBusAll;
787		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
788		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
789		rid_end = pe->rid + 1;
790	}
791
	/*
	 * Associate the PE in the PELT. We also need to add the PE
	 * to the corresponding PELT-V. Otherwise, an error
	 * originating from the PE might spread to other PEs.
	 */
798	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
799			     bcomp, dcomp, fcomp, OPAL_MAP_PE);
800	if (rc) {
801		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
802		return -ENXIO;
803	}
804
805	/* Configure PELTV */
806	pnv_ioda_set_peltv(phb, pe, true);
807
808	/* Setup reverse map */
809	for (rid = pe->rid; rid < rid_end; rid++)
810		phb->ioda.pe_rmap[rid] = pe->pe_number;
811
	/* Set up an MVE on IODA1 */
813	if (phb->type != PNV_PHB_IODA1) {
814		pe->mve_number = 0;
815		goto out;
816	}
817
818	pe->mve_number = pe->pe_number;
819	rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
820	if (rc != OPAL_SUCCESS) {
821		pe_err(pe, "OPAL error %ld setting up MVE %d\n",
822		       rc, pe->mve_number);
823		pe->mve_number = -1;
824	} else {
825		rc = opal_pci_set_mve_enable(phb->opal_id,
826					     pe->mve_number, OPAL_ENABLE_MVE);
827		if (rc) {
828			pe_err(pe, "OPAL error %ld enabling MVE %d\n",
829			       rc, pe->mve_number);
830			pe->mve_number = -1;
831		}
832	}
833
834out:
835	return 0;
836}
837
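
/*
 * Insert the PE into the PHB's DMA PE list, keeping the list sorted by
 * descending DMA weight.
 */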
838static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
839				       struct pnv_ioda_pe *pe)
840{
841	struct pnv_ioda_pe *lpe;
842
843	list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) {
844		if (lpe->dma_weight < pe->dma_weight) {
845			list_add_tail(&pe->dma_link, &lpe->dma_link);
846			return;
847		}
848	}
849	list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list);
850}
851
852static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
853{
854	/* This is quite simplistic. The "base" weight of a device
855	 * is 10. 0 means no DMA is to be accounted for it.
856	 */
857
858	/* If it's a bridge, no DMA */
859	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
860		return 0;
861
862	/* Reduce the weight of slow USB controllers */
863	if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
864	    dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
865	    dev->class == PCI_CLASS_SERIAL_USB_EHCI)
866		return 3;
867
868	/* Increase the weight of RAID (includes Obsidian) */
869	if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
870		return 15;
871
872	/* Default */
873	return 10;
874}
875
876#ifdef CONFIG_PCI_IOV
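/*
 * Shift the PF's IOV BARs by "offset" VF-sized segments so that each VF
 * BAR falls into the M64 segment (and therefore the PE) reserved for it.
 */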
877static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
878{
879	struct pci_dn *pdn = pci_get_pdn(dev);
880	int i;
881	struct resource *res, res2;
882	resource_size_t size;
883	u16 num_vfs;
884
885	if (!dev->is_physfn)
886		return -EINVAL;
887
888	/*
889	 * "offset" is in VFs.  The M64 windows are sized so that when they
890	 * are segmented, each segment is the same size as the IOV BAR.
891	 * Each segment is in a separate PE, and the high order bits of the
892	 * address are the PE number.  Therefore, each VF's BAR is in a
893	 * separate PE, and changing the IOV BAR start address changes the
894	 * range of PEs the VFs are in.
895	 */
896	num_vfs = pdn->num_vfs;
897	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
898		res = &dev->resource[i + PCI_IOV_RESOURCES];
899		if (!res->flags || !res->parent)
900			continue;
901
902		if (!pnv_pci_is_mem_pref_64(res->flags))
903			continue;
904
		/*
		 * The actual IOV BAR range is determined by the start address
		 * and the size needed for num_vfs VF BARs.  This check makes
		 * sure that, after shifting, the range will not overlap with
		 * another device.
		 */
911		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
912		res2.flags = res->flags;
913		res2.start = res->start + (size * offset);
914		res2.end = res2.start + (size * num_vfs) - 1;
915
916		if (res2.end > res->end) {
917			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
918				i, &res2, res, num_vfs, offset);
919			return -EBUSY;
920		}
921	}
922
	/*
	 * After the shift there will be a "hole" in /proc/iomem when
	 * offset is a positive value. It looks as if the device returned
	 * some MMIO space to the system, although nobody can actually
	 * use it.
	 */
928	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
929		res = &dev->resource[i + PCI_IOV_RESOURCES];
930		if (!res->flags || !res->parent)
931			continue;
932
933		if (!pnv_pci_is_mem_pref_64(res->flags))
934			continue;
935
936		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
937		res2 = *res;
938		res->start += size * offset;
939
940		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
941			 i, &res2, res, num_vfs, offset);
942		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
943	}
944	return 0;
945}
946#endif /* CONFIG_PCI_IOV */
947
948#if 0
949static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
950{
951	struct pci_controller *hose = pci_bus_to_host(dev->bus);
952	struct pnv_phb *phb = hose->private_data;
953	struct pci_dn *pdn = pci_get_pdn(dev);
954	struct pnv_ioda_pe *pe;
955	int pe_num;
956
957	if (!pdn) {
958		pr_err("%s: Device tree node not associated properly\n",
959			   pci_name(dev));
960		return NULL;
961	}
962	if (pdn->pe_number != IODA_INVALID_PE)
963		return NULL;
964
965	/* PE#0 has been pre-set */
966	if (dev->bus->number == 0)
967		pe_num = 0;
968	else
969		pe_num = pnv_ioda_alloc_pe(phb);
970	if (pe_num == IODA_INVALID_PE) {
971		pr_warning("%s: Not enough PE# available, disabling device\n",
972			   pci_name(dev));
973		return NULL;
974	}
975
976	/* NOTE: We get only one ref to the pci_dev for the pdn, not for the
977	 * pointer in the PE data structure, both should be destroyed at the
978	 * same time. However, this needs to be looked at more closely again
979	 * once we actually start removing things (Hotplug, SR-IOV, ...)
980	 *
981	 * At some point we want to remove the PDN completely anyways
982	 */
983	pe = &phb->ioda.pe_array[pe_num];
984	pci_dev_get(dev);
985	pdn->pcidev = dev;
986	pdn->pe_number = pe_num;
987	pe->pdev = dev;
988	pe->pbus = NULL;
989	pe->tce32_seg = -1;
990	pe->mve_number = -1;
991	pe->rid = dev->bus->number << 8 | pdn->devfn;
992
993	pe_info(pe, "Associated device to PE\n");
994
995	if (pnv_ioda_configure_pe(phb, pe)) {
996		/* XXX What do we do here ? */
997		if (pe_num)
998			pnv_ioda_free_pe(phb, pe_num);
999		pdn->pe_number = IODA_INVALID_PE;
1000		pe->pdev = NULL;
1001		pci_dev_put(dev);
1002		return NULL;
1003	}
1004
1005	/* Assign a DMA weight to the device */
1006	pe->dma_weight = pnv_ioda_dma_weight(dev);
1007	if (pe->dma_weight != 0) {
1008		phb->ioda.dma_weight += pe->dma_weight;
1009		phb->ioda.dma_pe_count++;
1010	}
1011
1012	/* Link the PE */
1013	pnv_ioda_link_pe_by_weight(phb, pe);
1014
1015	return pe;
1016}
1017#endif /* Useful for SRIOV case */
1018
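/*
 * Associate every device on the bus (and, for a "bus all" PE, on its
 * subordinate buses) with the given PE, accumulating their DMA weight.
 */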
1019static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
1020{
1021	struct pci_dev *dev;
1022
1023	list_for_each_entry(dev, &bus->devices, bus_list) {
1024		struct pci_dn *pdn = pci_get_pdn(dev);
1025
1026		if (pdn == NULL) {
1027			pr_warn("%s: No device node associated with device !\n",
1028				pci_name(dev));
1029			continue;
1030		}
1031		pdn->pe_number = pe->pe_number;
1032		pe->dma_weight += pnv_ioda_dma_weight(dev);
1033		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1034			pnv_ioda_setup_same_PE(dev->subordinate, pe);
1035	}
1036}
1037
/*
 * There are two types of PCI-bus-sensitive PEs: one comprising a single
 * PCI bus, and another containing the primary PCI bus plus its
 * subordinate PCI devices and buses. The second type of PE is normally
 * created for PCIe-to-PCI bridges or PLX switch downstream ports.
 */
1044static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
1045{
1046	struct pci_controller *hose = pci_bus_to_host(bus);
1047	struct pnv_phb *phb = hose->private_data;
1048	struct pnv_ioda_pe *pe;
1049	int pe_num = IODA_INVALID_PE;
1050
1051	/* Check if PE is determined by M64 */
1052	if (phb->pick_m64_pe)
1053		pe_num = phb->pick_m64_pe(phb, bus, all);
1054
1055	/* The PE number isn't pinned by M64 */
1056	if (pe_num == IODA_INVALID_PE)
1057		pe_num = pnv_ioda_alloc_pe(phb);
1058
1059	if (pe_num == IODA_INVALID_PE) {
1060		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
1061			__func__, pci_domain_nr(bus), bus->number);
1062		return;
1063	}
1064
1065	pe = &phb->ioda.pe_array[pe_num];
1066	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
1067	pe->pbus = bus;
1068	pe->pdev = NULL;
1069	pe->tce32_seg = -1;
1070	pe->mve_number = -1;
1071	pe->rid = bus->busn_res.start << 8;
1072	pe->dma_weight = 0;
1073
1074	if (all)
1075		pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
1076			bus->busn_res.start, bus->busn_res.end, pe_num);
1077	else
1078		pe_info(pe, "Secondary bus %d associated with PE#%d\n",
1079			bus->busn_res.start, pe_num);
1080
1081	if (pnv_ioda_configure_pe(phb, pe)) {
1082		/* XXX What do we do here ? */
1083		if (pe_num)
1084			pnv_ioda_free_pe(phb, pe_num);
1085		pe->pbus = NULL;
1086		return;
1087	}
1088
1089	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
1090			GFP_KERNEL, hose->node);
1091	pe->tce32_table->data = pe;
1092
1093	/* Associate it with all child devices */
1094	pnv_ioda_setup_same_PE(bus, pe);
1095
1096	/* Put PE to the list */
1097	list_add_tail(&pe->list, &phb->ioda.pe_list);
1098
	/* Account for one DMA PE if at least one DMA-capable device exists
	 * below the bridge.
	 */
1102	if (pe->dma_weight != 0) {
1103		phb->ioda.dma_weight += pe->dma_weight;
1104		phb->ioda.dma_pe_count++;
1105	}
1106
1107	/* Link the PE */
1108	pnv_ioda_link_pe_by_weight(phb, pe);
1109}
1110
1111static void pnv_ioda_setup_PEs(struct pci_bus *bus)
1112{
1113	struct pci_dev *dev;
1114
1115	pnv_ioda_setup_bus_PE(bus, 0);
1116
1117	list_for_each_entry(dev, &bus->devices, bus_list) {
1118		if (dev->subordinate) {
1119			if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE)
1120				pnv_ioda_setup_bus_PE(dev->subordinate, 1);
1121			else
1122				pnv_ioda_setup_PEs(dev->subordinate);
1123		}
1124	}
1125}
1126
/*
 * Configure PEs so that the downstream PCI buses and devices
 * have their associated PE#. Unfortunately, we haven't figured
 * out a way to identify PLX bridges yet, so for now we simply
 * assign a PE# to the PCI bus and everything subordinate to it
 * behind the root port. This scheme is expected to change as
 * soon as we can detect PLX bridges correctly.
 */
1135static void pnv_pci_ioda_setup_PEs(void)
1136{
1137	struct pci_controller *hose, *tmp;
1138	struct pnv_phb *phb;
1139
1140	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
1141		phb = hose->private_data;
1142
1143		/* M64 layout might affect PE allocation */
1144		if (phb->reserve_m64_pe)
1145			phb->reserve_m64_pe(phb);
1146
1147		pnv_ioda_setup_PEs(hose->bus);
1148	}
1149}
1150
1151#ifdef CONFIG_PCI_IOV
1152static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
1153{
1154	struct pci_bus        *bus;
1155	struct pci_controller *hose;
1156	struct pnv_phb        *phb;
1157	struct pci_dn         *pdn;
1158	int                    i, j;
1159
1160	bus = pdev->bus;
1161	hose = pci_bus_to_host(bus);
1162	phb = hose->private_data;
1163	pdn = pci_get_pdn(pdev);
1164
1165	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
1166		for (j = 0; j < M64_PER_IOV; j++) {
1167			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
1168				continue;
1169			opal_pci_phb_mmio_enable(phb->opal_id,
1170				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
1171			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
1172			pdn->m64_wins[i][j] = IODA_INVALID_M64;
1173		}
1174
1175	return 0;
1176}
1177
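/*
 * Allocate and program M64 BARs for the PF's IOV BARs: one shared BAR per
 * IOV BAR in the default case, or one BAR per VF group when the IOV BAR
 * is mapped with M64_PER_IOV BARs.
 */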
1178static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
1179{
1180	struct pci_bus        *bus;
1181	struct pci_controller *hose;
1182	struct pnv_phb        *phb;
1183	struct pci_dn         *pdn;
1184	unsigned int           win;
1185	struct resource       *res;
1186	int                    i, j;
1187	int64_t                rc;
1188	int                    total_vfs;
1189	resource_size_t        size, start;
1190	int                    pe_num;
1191	int                    vf_groups;
1192	int                    vf_per_group;
1193
1194	bus = pdev->bus;
1195	hose = pci_bus_to_host(bus);
1196	phb = hose->private_data;
1197	pdn = pci_get_pdn(pdev);
1198	total_vfs = pci_sriov_get_totalvfs(pdev);
1199
1200	/* Initialize the m64_wins to IODA_INVALID_M64 */
1201	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
1202		for (j = 0; j < M64_PER_IOV; j++)
1203			pdn->m64_wins[i][j] = IODA_INVALID_M64;
1204
1205	if (pdn->m64_per_iov == M64_PER_IOV) {
		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs : M64_PER_IOV;
		vf_per_group = (num_vfs <= M64_PER_IOV) ? 1 :
			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
1209	} else {
1210		vf_groups = 1;
1211		vf_per_group = 1;
1212	}
1213
1214	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
1215		res = &pdev->resource[i + PCI_IOV_RESOURCES];
1216		if (!res->flags || !res->parent)
1217			continue;
1218
1219		if (!pnv_pci_is_mem_pref_64(res->flags))
1220			continue;
1221
1222		for (j = 0; j < vf_groups; j++) {
1223			do {
1224				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
1225						phb->ioda.m64_bar_idx + 1, 0);
1226
1227				if (win >= phb->ioda.m64_bar_idx + 1)
1228					goto m64_failed;
1229			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
1230
1231			pdn->m64_wins[i][j] = win;
1232
1233			if (pdn->m64_per_iov == M64_PER_IOV) {
1234				size = pci_iov_resource_size(pdev,
1235							PCI_IOV_RESOURCES + i);
1236				size = size * vf_per_group;
1237				start = res->start + size * j;
1238			} else {
1239				size = resource_size(res);
1240				start = res->start;
1241			}
1242
1243			/* Map the M64 here */
1244			if (pdn->m64_per_iov == M64_PER_IOV) {
1245				pe_num = pdn->offset + j;
1246				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
1247						pe_num, OPAL_M64_WINDOW_TYPE,
1248						pdn->m64_wins[i][j], 0);
1249			}
1250
1251			rc = opal_pci_set_phb_mem_window(phb->opal_id,
1252						 OPAL_M64_WINDOW_TYPE,
1253						 pdn->m64_wins[i][j],
1254						 start,
1255						 0, /* unused */
1256						 size);
1257
1258
1259			if (rc != OPAL_SUCCESS) {
1260				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
1261					win, rc);
1262				goto m64_failed;
1263			}
1264
1265			if (pdn->m64_per_iov == M64_PER_IOV)
1266				rc = opal_pci_phb_mmio_enable(phb->opal_id,
1267				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
1268			else
1269				rc = opal_pci_phb_mmio_enable(phb->opal_id,
1270				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
1271
1272			if (rc != OPAL_SUCCESS) {
1273				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
1274					win, rc);
1275				goto m64_failed;
1276			}
1277		}
1278	}
1279	return 0;
1280
1281m64_failed:
1282	pnv_pci_vf_release_m64(pdev);
1283	return -EBUSY;
1284}
1285
1286static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
1287{
1288	struct pci_bus        *bus;
1289	struct pci_controller *hose;
1290	struct pnv_phb        *phb;
1291	struct iommu_table    *tbl;
1292	unsigned long         addr;
1293	int64_t               rc;
1294
1295	bus = dev->bus;
1296	hose = pci_bus_to_host(bus);
1297	phb = hose->private_data;
1298	tbl = pe->tce32_table;
1299	addr = tbl->it_base;
1300
1301	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
1302				   pe->pe_number << 1, 1, __pa(addr),
1303				   0, 0x1000);
1304
1305	rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1306				        pe->pe_number,
1307				        (pe->pe_number << 1) + 1,
1308				        pe->tce_bypass_base,
1309				        0);
1310	if (rc)
1311		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
1312
1313	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
1314	free_pages(addr, get_order(TCE32_TABLE_SIZE));
1315	pe->tce32_table = NULL;
1316}
1317
1318static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
1319{
1320	struct pci_bus        *bus;
1321	struct pci_controller *hose;
1322	struct pnv_phb        *phb;
1323	struct pnv_ioda_pe    *pe, *pe_n;
1324	struct pci_dn         *pdn;
1325	u16                    vf_index;
1326	int64_t                rc;
1327
1328	bus = pdev->bus;
1329	hose = pci_bus_to_host(bus);
1330	phb = hose->private_data;
1331	pdn = pci_get_pdn(pdev);
1332
1333	if (!pdev->is_physfn)
1334		return;
1335
1336	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
1337		int   vf_group;
1338		int   vf_per_group;
1339		int   vf_index1;
1340
1341		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
1342
1343		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
1344			for (vf_index = vf_group * vf_per_group;
1345				vf_index < (vf_group + 1) * vf_per_group &&
1346				vf_index < num_vfs;
1347				vf_index++)
1348				for (vf_index1 = vf_group * vf_per_group;
1349					vf_index1 < (vf_group + 1) * vf_per_group &&
1350					vf_index1 < num_vfs;
1351					vf_index1++){
1352
1353					rc = opal_pci_set_peltv(phb->opal_id,
1354						pdn->offset + vf_index,
1355						pdn->offset + vf_index1,
1356						OPAL_REMOVE_PE_FROM_DOMAIN);
1357
1358					if (rc)
1359					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
1360						__func__,
1361						pdn->offset + vf_index1, rc);
1362				}
1363	}
1364
1365	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
1366		if (pe->parent_dev != pdev)
1367			continue;
1368
1369		pnv_pci_ioda2_release_dma_pe(pdev, pe);
1370
1371		/* Remove from list */
1372		mutex_lock(&phb->ioda.pe_list_mutex);
1373		list_del(&pe->list);
1374		mutex_unlock(&phb->ioda.pe_list_mutex);
1375
1376		pnv_ioda_deconfigure_pe(phb, pe);
1377
1378		pnv_ioda_free_pe(phb, pe->pe_number);
1379	}
1380}
1381
1382void pnv_pci_sriov_disable(struct pci_dev *pdev)
1383{
1384	struct pci_bus        *bus;
1385	struct pci_controller *hose;
1386	struct pnv_phb        *phb;
1387	struct pci_dn         *pdn;
1388	struct pci_sriov      *iov;
1389	u16 num_vfs;
1390
1391	bus = pdev->bus;
1392	hose = pci_bus_to_host(bus);
1393	phb = hose->private_data;
1394	pdn = pci_get_pdn(pdev);
1395	iov = pdev->sriov;
1396	num_vfs = pdn->num_vfs;
1397
1398	/* Release VF PEs */
1399	pnv_ioda_release_vf_PE(pdev, num_vfs);
1400
1401	if (phb->type == PNV_PHB_IODA2) {
1402		if (pdn->m64_per_iov == 1)
1403			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
1404
1405		/* Release M64 windows */
1406		pnv_pci_vf_release_m64(pdev);
1407
1408		/* Release PE numbers */
1409		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
1410		pdn->offset = 0;
1411	}
1412}
1413
1414static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
1415				       struct pnv_ioda_pe *pe);
1416static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
1417{
1418	struct pci_bus        *bus;
1419	struct pci_controller *hose;
1420	struct pnv_phb        *phb;
1421	struct pnv_ioda_pe    *pe;
1422	int                    pe_num;
1423	u16                    vf_index;
1424	struct pci_dn         *pdn;
1425	int64_t                rc;
1426
1427	bus = pdev->bus;
1428	hose = pci_bus_to_host(bus);
1429	phb = hose->private_data;
1430	pdn = pci_get_pdn(pdev);
1431
1432	if (!pdev->is_physfn)
1433		return;
1434
1435	/* Reserve PE for each VF */
1436	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
1437		pe_num = pdn->offset + vf_index;
1438
1439		pe = &phb->ioda.pe_array[pe_num];
1440		pe->pe_number = pe_num;
1441		pe->phb = phb;
1442		pe->flags = PNV_IODA_PE_VF;
1443		pe->pbus = NULL;
1444		pe->parent_dev = pdev;
1445		pe->tce32_seg = -1;
1446		pe->mve_number = -1;
1447		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
1448			   pci_iov_virtfn_devfn(pdev, vf_index);
1449
1450		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
1451			hose->global_number, pdev->bus->number,
1452			PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
1453			PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
1454
1455		if (pnv_ioda_configure_pe(phb, pe)) {
1456			/* XXX What do we do here ? */
1457			if (pe_num)
1458				pnv_ioda_free_pe(phb, pe_num);
1459			pe->pdev = NULL;
1460			continue;
1461		}
1462
1463		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
1464				GFP_KERNEL, hose->node);
1465		pe->tce32_table->data = pe;
1466
1467		/* Put PE to the list */
1468		mutex_lock(&phb->ioda.pe_list_mutex);
1469		list_add_tail(&pe->list, &phb->ioda.pe_list);
1470		mutex_unlock(&phb->ioda.pe_list_mutex);
1471
1472		pnv_pci_ioda2_setup_dma_pe(phb, pe);
1473	}
1474
1475	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
1476		int   vf_group;
1477		int   vf_per_group;
1478		int   vf_index1;
1479
1480		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
1481
1482		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
1483			for (vf_index = vf_group * vf_per_group;
1484			     vf_index < (vf_group + 1) * vf_per_group &&
1485			     vf_index < num_vfs;
1486			     vf_index++) {
1487				for (vf_index1 = vf_group * vf_per_group;
1488				     vf_index1 < (vf_group + 1) * vf_per_group &&
1489				     vf_index1 < num_vfs;
1490				     vf_index1++) {
1491
1492					rc = opal_pci_set_peltv(phb->opal_id,
1493						pdn->offset + vf_index,
1494						pdn->offset + vf_index1,
1495						OPAL_ADD_PE_TO_DOMAIN);
1496
1497					if (rc)
1498					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
1499						__func__,
1500						pdn->offset + vf_index1, rc);
1501				}
1502			}
1503		}
1504	}
1505}
1506
1507int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1508{
1509	struct pci_bus        *bus;
1510	struct pci_controller *hose;
1511	struct pnv_phb        *phb;
1512	struct pci_dn         *pdn;
1513	int                    ret;
1514
1515	bus = pdev->bus;
1516	hose = pci_bus_to_host(bus);
1517	phb = hose->private_data;
1518	pdn = pci_get_pdn(pdev);
1519
1520	if (phb->type == PNV_PHB_IODA2) {
1521		/* Calculate available PE for required VFs */
1522		mutex_lock(&phb->ioda.pe_alloc_mutex);
1523		pdn->offset = bitmap_find_next_zero_area(
1524			phb->ioda.pe_alloc, phb->ioda.total_pe,
1525			0, num_vfs, 0);
1526		if (pdn->offset >= phb->ioda.total_pe) {
1527			mutex_unlock(&phb->ioda.pe_alloc_mutex);
			dev_info(&pdev->dev, "Failed to enable %d VFs\n", num_vfs);
1529			pdn->offset = 0;
1530			return -EBUSY;
1531		}
1532		bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
1533		pdn->num_vfs = num_vfs;
1534		mutex_unlock(&phb->ioda.pe_alloc_mutex);
1535
1536		/* Assign M64 window accordingly */
1537		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
1538		if (ret) {
1539			dev_info(&pdev->dev, "Not enough M64 window resources\n");
1540			goto m64_failed;
1541		}
1542
1543		/*
1544		 * When using one M64 BAR to map one IOV BAR, we need to shift
1545		 * the IOV BAR according to the PE# allocated to the VFs.
1546		 * Otherwise, the PE# for the VF will conflict with others.
1547		 */
1548		if (pdn->m64_per_iov == 1) {
1549			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
1550			if (ret)
1551				goto m64_failed;
1552		}
1553	}
1554
1555	/* Setup VF PEs */
1556	pnv_ioda_setup_vf_PE(pdev, num_vfs);
1557
1558	return 0;
1559
1560m64_failed:
1561	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
1562	pdn->offset = 0;
1563
1564	return ret;
1565}
1566
1567int pcibios_sriov_disable(struct pci_dev *pdev)
1568{
1569	pnv_pci_sriov_disable(pdev);
1570
1571	/* Release PCI data */
1572	remove_dev_pci_data(pdev);
1573	return 0;
1574}
1575
1576int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1577{
1578	/* Allocate PCI data */
1579	add_dev_pci_data(pdev);
1580
1581	pnv_pci_sriov_enable(pdev, num_vfs);
1582	return 0;
1583}
1584#endif /* CONFIG_PCI_IOV */
1585
1586static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
1587{
1588	struct pci_dn *pdn = pci_get_pdn(pdev);
1589	struct pnv_ioda_pe *pe;
1590
	/*
	 * This function can be called before the PE# has
	 * been assigned. Do nothing in that case.
	 */
1596	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
1597		return;
1598
1599	pe = &phb->ioda.pe_array[pdn->pe_number];
1600	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
1601	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
1602}
1603
1604static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
1605				     struct pci_dev *pdev, u64 dma_mask)
1606{
1607	struct pci_dn *pdn = pci_get_pdn(pdev);
1608	struct pnv_ioda_pe *pe;
1609	uint64_t top;
1610	bool bypass = false;
1611
1612	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
		return -ENODEV;
1614
1615	pe = &phb->ioda.pe_array[pdn->pe_number];
1616	if (pe->tce_bypass_enabled) {
1617		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
1618		bypass = (dma_mask >= top);
1619	}
1620
1621	if (bypass) {
1622		dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
1623		set_dma_ops(&pdev->dev, &dma_direct_ops);
1624		set_dma_offset(&pdev->dev, pe->tce_bypass_base);
1625	} else {
1626		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
1627		set_dma_ops(&pdev->dev, &dma_iommu_ops);
1628		set_iommu_table_base(&pdev->dev, pe->tce32_table);
1629	}
1630	*pdev->dev.dma_mask = dma_mask;
1631	return 0;
1632}
1633
1634static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb,
1635					      struct pci_dev *pdev)
1636{
1637	struct pci_dn *pdn = pci_get_pdn(pdev);
1638	struct pnv_ioda_pe *pe;
1639	u64 end, mask;
1640
1641	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1642		return 0;
1643
1644	pe = &phb->ioda.pe_array[pdn->pe_number];
1645	if (!pe->tce_bypass_enabled)
1646		return __dma_get_required_mask(&pdev->dev);
1647
1648
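	/*
	 * Return a mask covering the highest address of the bypass window,
	 * i.e. 2^fls64(end) - 1.
	 */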
1649	end = pe->tce_bypass_base + memblock_end_of_DRAM();
1650	mask = 1ULL << (fls64(end) - 1);
1651	mask += mask - 1;
1652
1653	return mask;
1654}
1655
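/*
 * Walk all devices below the bus and point them at the PE's 32-bit TCE
 * table, optionally attaching them to the PE's IOMMU group as well.
 */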
1656static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
1657				   struct pci_bus *bus,
1658				   bool add_to_iommu_group)
1659{
1660	struct pci_dev *dev;
1661
1662	list_for_each_entry(dev, &bus->devices, bus_list) {
1663		if (add_to_iommu_group)
1664			set_iommu_table_base_and_group(&dev->dev,
1665						       pe->tce32_table);
1666		else
1667			set_iommu_table_base(&dev->dev, pe->tce32_table);
1668
1669		if (dev->subordinate)
1670			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
1671					       add_to_iommu_group);
1672	}
1673}
1674
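/*
 * Invalidate the TCEs backing [startp, endp] by writing to the P7IOC-style
 * "TCE kill" register; "rm" selects the real-mode (physical) mapping of
 * that register so the helper can be used from hypervisor real mode.
 */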
1675static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
1676					 struct iommu_table *tbl,
1677					 __be64 *startp, __be64 *endp, bool rm)
1678{
1679	__be64 __iomem *invalidate = rm ?
1680		(__be64 __iomem *)pe->tce_inval_reg_phys :
1681		(__be64 __iomem *)tbl->it_index;
1682	unsigned long start, end, inc;
1683	const unsigned shift = tbl->it_page_shift;
1684
1685	start = __pa(startp);
1686	end = __pa(endp);
1687
1688	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
1689	if (tbl->it_busno) {
1690		start <<= shift;
1691		end <<= shift;
1692		inc = 128ull << shift;
1693		start |= tbl->it_busno;
1694		end |= tbl->it_busno;
1695	} else if (tbl->it_type & TCE_PCI_SWINV_PAIR) {
1696		/* p7ioc-style invalidation, 2 TCEs per write */
1697		start |= (1ull << 63);
1698		end |= (1ull << 63);
1699		inc = 16;
	} else {
		/* Default (older HW) */
		inc = 128;
	}

	end |= inc - 1;	/* round up end to be different than start */

	mb(); /* Ensure above stores are visible */
	while (start <= end) {
		if (rm)
			__raw_rm_writeq(cpu_to_be64(start), invalidate);
		else
			__raw_writeq(cpu_to_be64(start), invalidate);
		start += inc;
	}
1715
1716	/*
1717	 * The iommu layer will do another mb() for us on build()
1718	 * and we don't care on free()
1719	 */
1720}
1721
1722static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
1723					 struct iommu_table *tbl,
1724					 __be64 *startp, __be64 *endp, bool rm)
1725{
1726	unsigned long start, end, inc;
1727	__be64 __iomem *invalidate = rm ?
1728		(__be64 __iomem *)pe->tce_inval_reg_phys :
1729		(__be64 __iomem *)tbl->it_index;
1730	const unsigned shift = tbl->it_page_shift;
1731
1732	/* We'll invalidate DMA address in PE scope */
1733	start = 0x2ull << 60;
1734	start |= (pe->pe_number & 0xFF);
1735	end = start;
1736
1737	/* Figure out the start, end and step */
1738	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
1739	start |= (inc << shift);
1740	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
1741	end |= (inc << shift);
1742	inc = (0x1ull << shift);
1743	mb();
1744
1745	while (start <= end) {
1746		if (rm)
1747			__raw_rm_writeq(cpu_to_be64(start), invalidate);
1748		else
1749			__raw_writeq(cpu_to_be64(start), invalidate);
1750		start += inc;
1751	}
1752}
1753
1754void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
1755				 __be64 *startp, __be64 *endp, bool rm)
1756{
1757	struct pnv_ioda_pe *pe = tbl->data;
1758	struct pnv_phb *phb = pe->phb;
1759
1760	if (phb->type == PNV_PHB_IODA1)
1761		pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm);
1762	else
1763		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
1764}
1765
1766static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
1767				      struct pnv_ioda_pe *pe, unsigned int base,
1768				      unsigned int segs)
1769{
1770
1771	struct page *tce_mem = NULL;
1772	const __be64 *swinvp;
1773	struct iommu_table *tbl;
1774	unsigned int i;
1775	int64_t rc;
1776	void *addr;
1777
1778	/* XXX FIXME: Handle 64-bit only DMA devices */
1779	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
1780	/* XXX FIXME: Allocate multi-level tables on PHB3 */
1781
1782	/* We shouldn't already have a 32-bit DMA associated */
1783	if (WARN_ON(pe->tce32_seg >= 0))
1784		return;
1785
1786	/* Grab a 32-bit TCE table */
1787	pe->tce32_seg = base;
1788	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
1789		(base << 28), ((base + segs) << 28) - 1);
1790
1791	/* XXX Currently, we allocate one big contiguous table for the
1792	 * TCEs. We only really need one chunk per 256M of TCE space
1793	 * (ie per segment) but that's an optimization for later, it
1794	 * requires some added smarts with our get/put_tce implementation
1795	 */
1796	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
1797				   get_order(TCE32_TABLE_SIZE * segs));
1798	if (!tce_mem) {
1799		pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
1800		goto fail;
1801	}
1802	addr = page_address(tce_mem);
1803	memset(addr, 0, TCE32_TABLE_SIZE * segs);
1804
1805	/* Configure HW */
1806	for (i = 0; i < segs; i++) {
1807		rc = opal_pci_map_pe_dma_window(phb->opal_id,
1808					      pe->pe_number,
1809					      base + i, 1,
1810					      __pa(addr) + TCE32_TABLE_SIZE * i,
1811					      TCE32_TABLE_SIZE, 0x1000);
1812		if (rc) {
1813			pe_err(pe, " Failed to configure 32-bit TCE table,"
1814			       " err %ld\n", rc);
1815			goto fail;
1816		}
1817	}
1818
1819	/* Setup linux iommu table */
1820	tbl = pe->tce32_table;
1821	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
1822				  base << 28, IOMMU_PAGE_SHIFT_4K);
1823
1824	/* OPAL variant of P7IOC SW invalidated TCEs */
1825	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
1826	if (swinvp) {
1827		/* We need a couple more fields -- an address and a data
1828		 * to or.  Since the bus is only printed out on table free
1829		 * errors, and on the first pass the data will be a relative
1830		 * bus number, print that out instead.
1831		 */
1832		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
1833		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
1834				8);
1835		tbl->it_type |= (TCE_PCI_SWINV_CREATE |
1836				 TCE_PCI_SWINV_FREE   |
1837				 TCE_PCI_SWINV_PAIR);
1838	}
1839	iommu_init_table(tbl, phb->hose->node);
1840
1841	if (pe->flags & PNV_IODA_PE_DEV) {
1842		iommu_register_group(tbl, phb->hose->global_number,
1843				     pe->pe_number);
1844		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
1845	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
1846		iommu_register_group(tbl, phb->hose->global_number,
1847				     pe->pe_number);
1848		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
1849	} else if (pe->flags & PNV_IODA_PE_VF) {
1850		iommu_register_group(tbl, phb->hose->global_number,
1851				     pe->pe_number);
1852	}
1853
1854	return;
1855 fail:
1856	/* XXX Failure: Try to fallback to 64-bit only ? */
1857	if (pe->tce32_seg >= 0)
1858		pe->tce32_seg = -1;
1859	if (tce_mem)
1860		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
1861}
1862
1863static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
1864{
1865	struct pnv_ioda_pe *pe = tbl->data;
1866	uint16_t window_id = (pe->pe_number << 1 ) + 1;
1867	int64_t rc;
1868
1869	pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
1870	if (enable) {
1871		phys_addr_t top = memblock_end_of_DRAM();
1872
1873		top = roundup_pow_of_two(top);
1874		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1875						     pe->pe_number,
1876						     window_id,
1877						     pe->tce_bypass_base,
1878						     top);
1879	} else {
1880		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1881						     pe->pe_number,
1882						     window_id,
1883						     pe->tce_bypass_base,
1884						     0);
1885
		/*
		 * EEH needs the mapping between the IOMMU table and the
		 * group of VFIO/KVM pass-through devices. We can postpone
		 * resetting the DMA ops until the DMA mask is configured
		 * on the host side.
		 */
1892		if (pe->pdev)
1893			set_iommu_table_base(&pe->pdev->dev, tbl);
1894		else
1895			pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
1896	}
1897	if (rc)
1898		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
1899	else
1900		pe->tce_bypass_enabled = enable;
1901}
1902
1903static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
1904					  struct pnv_ioda_pe *pe)
1905{
1906	/* TVE #1 is selected by PCI address bit 59 */
1907	pe->tce_bypass_base = 1ull << 59;
1908
1909	/* Install set_bypass callback for VFIO */
1910	pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;
1911
1912	/* Enable bypass by default */
1913	pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
1914}
1915
1916static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
1917				       struct pnv_ioda_pe *pe)
1918{
1919	struct page *tce_mem = NULL;
1920	void *addr;
1921	const __be64 *swinvp;
1922	struct iommu_table *tbl;
1923	unsigned int tce_table_size, end;
1924	int64_t rc;
1925
	/* We shouldn't already have a 32-bit DMA window associated */
1927	if (WARN_ON(pe->tce32_seg >= 0))
1928		return;
1929
	/* The PE will reserve all possible 32-bit space */
1931	pe->tce32_seg = 0;
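	/*
	 * One 8-byte TCE per 4K page, covering the 32-bit DMA space up to
	 * the M32 PCI base (rounded down to a power of two).
	 */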
1932	end = (1 << ilog2(phb->ioda.m32_pci_base));
1933	tce_table_size = (end / 0x1000) * 8;
1934	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
1935		end);
1936
1937	/* Allocate TCE table */
1938	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
1939				   get_order(tce_table_size));
1940	if (!tce_mem) {
		pe_err(pe, "Failed to allocate 32-bit TCE table memory\n");
1942		goto fail;
1943	}
1944	addr = page_address(tce_mem);
1945	memset(addr, 0, tce_table_size);
1946
1947	/*
1948	 * Map TCE table through TVT. The TVE index is the PE number
	 * shifted left by 1 bit for the 32-bit DMA space.
1950	 */
1951	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
1952					pe->pe_number << 1, 1, __pa(addr),
1953					tce_table_size, 0x1000);
1954	if (rc) {
		pe_err(pe, "Failed to configure 32-bit TCE table,"
		       " err %lld\n", rc);
1957		goto fail;
1958	}
1959
1960	/* Setup linux iommu table */
1961	tbl = pe->tce32_table;
1962	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
1963			IOMMU_PAGE_SHIFT_4K);
1964
1965	/* OPAL variant of PHB3 invalidated TCEs */
1966	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
1967	if (swinvp) {
		/*
		 * Record and map the TCE "kill" (invalidate) register so the
		 * IOMMU code can invalidate TCEs in software when entries
		 * are created or freed.
		 */
1973		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
1974		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
1975				8);
1976		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
1977	}
1978	iommu_init_table(tbl, phb->hose->node);
1979
1980	if (pe->flags & PNV_IODA_PE_DEV) {
1981		iommu_register_group(tbl, phb->hose->global_number,
1982				     pe->pe_number);
1983		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
1984	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
1985		iommu_register_group(tbl, phb->hose->global_number,
1986				     pe->pe_number);
1987		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
1988	} else if (pe->flags & PNV_IODA_PE_VF) {
1989		iommu_register_group(tbl, phb->hose->global_number,
1990				     pe->pe_number);
1991	}
1992
1993	/* Also create a bypass window */
1994	if (!pnv_iommu_bypass_disabled)
1995		pnv_pci_ioda2_setup_bypass_pe(phb, pe);
1996
1997	return;
1998fail:
1999	if (pe->tce32_seg >= 0)
2000		pe->tce32_seg = -1;
2001	if (tce_mem)
2002		__free_pages(tce_mem, get_order(tce_table_size));
2003}
2004
2005static void pnv_ioda_setup_dma(struct pnv_phb *phb)
2006{
2007	struct pci_controller *hose = phb->hose;
2008	unsigned int residual, remaining, segs, tw, base;
2009	struct pnv_ioda_pe *pe;
2010
	/* If we have more PEs than segments available, hand out one
	 * segment per PE until we run out and let the rest fail. If not,
	 * then we assign at least one segment per PE, plus more based
	 * on the DMA weight of the devices under that PE.
	 */
2016	if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
2017		residual = 0;
2018	else
2019		residual = phb->ioda.tce32_count -
2020			phb->ioda.dma_pe_count;
2021
2022	pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
2023		hose->global_number, phb->ioda.tce32_count);
2024	pr_info("PCI: %d PE# for a total weight of %d\n",
2025		phb->ioda.dma_pe_count, phb->ioda.dma_weight);
2026
2027	/* Walk our PE list and configure their DMA segments, hand them
2028	 * out one base segment plus any residual segments based on
2029	 * weight
2030	 */
2031	remaining = phb->ioda.tce32_count;
2032	tw = phb->ioda.dma_weight;
2033	base = 0;
2034	list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
2035		if (!pe->dma_weight)
2036			continue;
2037		if (!remaining) {
2038			pe_warn(pe, "No DMA32 resources available\n");
2039			continue;
2040		}
2041		segs = 1;
2042		if (residual) {
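			/*
			 * Hand out residual segments pro-rata by weight,
			 * rounded to nearest: e.g. with a total weight of 20,
			 * a residual of 10 and a PE weight of 5, this adds
			 * (5 * 10 + 10) / 20 = 3 extra segments.
			 */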
2043			segs += ((pe->dma_weight * residual)  + (tw / 2)) / tw;
2044			if (segs > remaining)
2045				segs = remaining;
2046		}
2047
		/*
		 * For the IODA2 compliant PHB3, we needn't care about the
		 * weight. All of the available 32-bit DMA space is assigned
		 * to the PE.
		 */
2053		if (phb->type == PNV_PHB_IODA1) {
2054			pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
2055				pe->dma_weight, segs);
2056			pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
2057		} else {
2058			pe_info(pe, "Assign DMA32 space\n");
2059			segs = 0;
2060			pnv_pci_ioda2_setup_dma_pe(phb, pe);
2061		}
2062
2063		remaining -= segs;
2064		base += segs;
2065	}
2066}
2067
2068#ifdef CONFIG_PCI_MSI
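/*
 * On IODA2 (PHB3) an MSI must also be acknowledged at the PHB via OPAL
 * in addition to the normal XICS EOI, so the IRQ chip's irq_eoi hook is
 * overridden with this wrapper (see set_msi_irq_chip() below).
 */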
2069static void pnv_ioda2_msi_eoi(struct irq_data *d)
2070{
2071	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
2072	struct irq_chip *chip = irq_data_get_irq_chip(d);
2073	struct pnv_phb *phb = container_of(chip, struct pnv_phb,
2074					   ioda.irq_chip);
2075	int64_t rc;
2076
2077	rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
2078	WARN_ON_ONCE(rc);
2079
2080	icp_native_eoi(d);
2081}
2082
2083
2084static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
2085{
2086	struct irq_data *idata;
2087	struct irq_chip *ichip;
2088
2089	if (phb->type != PNV_PHB_IODA2)
2090		return;
2091
2092	if (!phb->ioda.irq_chip_init) {
		/*
		 * The first time we set up an MSI IRQ, we need to set up the
		 * corresponding IRQ chip so that EOIs are routed correctly.
		 */
2097		idata = irq_get_irq_data(virq);
2098		ichip = irq_data_get_irq_chip(idata);
2099		phb->ioda.irq_chip_init = 1;
2100		phb->ioda.irq_chip = *ichip;
2101		phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
2102	}
2103	irq_set_chip(virq, &phb->ioda.irq_chip);
2104}
2105
2106#ifdef CONFIG_CXL_BASE
2107
2108struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
2109{
2110	struct pci_controller *hose = pci_bus_to_host(dev->bus);
2111
2112	return of_node_get(hose->dn);
2113}
2114EXPORT_SYMBOL(pnv_pci_get_phb_node);
2115
2116int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
2117{
2118	struct pci_controller *hose = pci_bus_to_host(dev->bus);
2119	struct pnv_phb *phb = hose->private_data;
2120	struct pnv_ioda_pe *pe;
2121	int rc;
2122
2123	pe = pnv_ioda_get_pe(dev);
2124	if (!pe)
2125		return -ENODEV;
2126
2127	pe_info(pe, "Switching PHB to CXL\n");
2128
2129	rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number);
2130	if (rc)
2131		dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc);
2132
2133	return rc;
2134}
2135EXPORT_SYMBOL(pnv_phb_to_cxl_mode);
2136
/* Find the PHB for the cxl device and allocate MSI hwirqs.
 * Returns the absolute hardware IRQ number.
 */
2140int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num)
2141{
2142	struct pci_controller *hose = pci_bus_to_host(dev->bus);
2143	struct pnv_phb *phb = hose->private_data;
2144	int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num);
2145
2146	if (hwirq < 0) {
2147		dev_warn(&dev->dev, "Failed to find a free MSI\n");
2148		return -ENOSPC;
2149	}
2150
2151	return phb->msi_base + hwirq;
2152}
2153EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs);
2154
2155void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num)
2156{
2157	struct pci_controller *hose = pci_bus_to_host(dev->bus);
2158	struct pnv_phb *phb = hose->private_data;
2159
2160	msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num);
2161}
2162EXPORT_SYMBOL(pnv_cxl_release_hwirqs);
2163
2164void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs,
2165				  struct pci_dev *dev)
2166{
2167	struct pci_controller *hose = pci_bus_to_host(dev->bus);
2168	struct pnv_phb *phb = hose->private_data;
2169	int i, hwirq;
2170
2171	for (i = 1; i < CXL_IRQ_RANGES; i++) {
2172		if (!irqs->range[i])
2173			continue;
2174		pr_devel("cxl release irq range 0x%x: offset: 0x%lx  limit: %ld\n",
2175			 i, irqs->offset[i],
2176			 irqs->range[i]);
2177		hwirq = irqs->offset[i] - phb->msi_base;
2178		msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq,
2179				       irqs->range[i]);
2180	}
2181}
2182EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges);
2183
2184int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
2185			       struct pci_dev *dev, int num)
2186{
2187	struct pci_controller *hose = pci_bus_to_host(dev->bus);
2188	struct pnv_phb *phb = hose->private_data;
2189	int i, hwirq, try;
2190
2191	memset(irqs, 0, sizeof(struct cxl_irq_ranges));
2192
2193	/* 0 is reserved for the multiplexed PSL DSI interrupt */
2194	for (i = 1; i < CXL_IRQ_RANGES && num; i++) {
2195		try = num;
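		/* Halve the request until a contiguous block can be allocated. */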
2196		while (try) {
2197			hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try);
2198			if (hwirq >= 0)
2199				break;
2200			try /= 2;
2201		}
2202		if (!try)
2203			goto fail;
2204
2205		irqs->offset[i] = phb->msi_base + hwirq;
2206		irqs->range[i] = try;
2207		pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx  limit: %li\n",
2208			 i, irqs->offset[i], irqs->range[i]);
2209		num -= try;
2210	}
2211	if (num)
2212		goto fail;
2213
2214	return 0;
2215fail:
2216	pnv_cxl_release_hwirq_ranges(irqs, dev);
2217	return -ENOSPC;
2218}
2219EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges);
2220
2221int pnv_cxl_get_irq_count(struct pci_dev *dev)
2222{
2223	struct pci_controller *hose = pci_bus_to_host(dev->bus);
2224	struct pnv_phb *phb = hose->private_data;
2225
2226	return phb->msi_bmp.irq_count;
2227}
2228EXPORT_SYMBOL(pnv_cxl_get_irq_count);
2229
2230int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
2231			   unsigned int virq)
2232{
2233	struct pci_controller *hose = pci_bus_to_host(dev->bus);
2234	struct pnv_phb *phb = hose->private_data;
2235	unsigned int xive_num = hwirq - phb->msi_base;
2236	struct pnv_ioda_pe *pe;
2237	int rc;
2238
2239	if (!(pe = pnv_ioda_get_pe(dev)))
2240		return -ENODEV;
2241
2242	/* Assign XIVE to PE */
2243	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2244	if (rc) {
2245		pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x "
2246			"hwirq 0x%x XIVE 0x%x PE\n",
2247			pci_name(dev), rc, phb->msi_base, hwirq, xive_num);
2248		return -EIO;
2249	}
2250	set_msi_irq_chip(phb, virq);
2251
2252	return 0;
2253}
2254EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup);
2255#endif
2256
2257static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
2258				  unsigned int hwirq, unsigned int virq,
2259				  unsigned int is_64, struct msi_msg *msg)
2260{
2261	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
2262	unsigned int xive_num = hwirq - phb->msi_base;
2263	__be32 data;
2264	int rc;
2265
2266	/* No PE assigned ? bail out ... no MSI for you ! */
2267	if (pe == NULL)
2268		return -ENXIO;
2269
2270	/* Check if we have an MVE */
2271	if (pe->mve_number < 0)
2272		return -ENXIO;
2273
2274	/* Force 32-bit MSI on some broken devices */
2275	if (dev->no_64bit_msi)
2276		is_64 = 0;
2277
2278	/* Assign XIVE to PE */
2279	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2280	if (rc) {
2281		pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
2282			pci_name(dev), rc, xive_num);
2283		return -EIO;
2284	}
2285
2286	if (is_64) {
2287		__be64 addr64;
2288
2289		rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
2290				     &addr64, &data);
2291		if (rc) {
2292			pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
2293				pci_name(dev), rc);
2294			return -EIO;
2295		}
2296		msg->address_hi = be64_to_cpu(addr64) >> 32;
2297		msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
2298	} else {
2299		__be32 addr32;
2300
2301		rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
2302				     &addr32, &data);
2303		if (rc) {
2304			pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
2305				pci_name(dev), rc);
2306			return -EIO;
2307		}
2308		msg->address_hi = 0;
2309		msg->address_lo = be32_to_cpu(addr32);
2310	}
2311	msg->data = be32_to_cpu(data);
2312
2313	set_msi_irq_chip(phb, virq);
2314
2315	pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
2316		 " address=%x_%08x data=%x PE# %d\n",
2317		 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
2318		 msg->address_hi, msg->address_lo, data, pe->pe_number);
2319
2320	return 0;
2321}
2322
2323static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
2324{
2325	unsigned int count;
2326	const __be32 *prop = of_get_property(phb->hose->dn,
2327					     "ibm,opal-msi-ranges", NULL);
2328	if (!prop) {
2329		/* BML Fallback */
2330		prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
2331	}
2332	if (!prop)
2333		return;
2334
2335	phb->msi_base = be32_to_cpup(prop);
2336	count = be32_to_cpup(prop + 1);
2337	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
2338		pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
2339		       phb->hose->global_number);
2340		return;
2341	}
2342
2343	phb->msi_setup = pnv_pci_ioda_msi_setup;
2344	phb->msi32_support = 1;
2345	pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
2346		count, phb->msi_base);
2347}
2348#else
2349static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
2350#endif /* CONFIG_PCI_MSI */
2351
2352#ifdef CONFIG_PCI_IOV
2353static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
2354{
2355	struct pci_controller *hose;
2356	struct pnv_phb *phb;
2357	struct resource *res;
2358	int i;
2359	resource_size_t size;
2360	struct pci_dn *pdn;
2361	int mul, total_vfs;
2362
2363	if (!pdev->is_physfn || pdev->is_added)
2364		return;
2365
2366	hose = pci_bus_to_host(pdev->bus);
2367	phb = hose->private_data;
2368
2369	pdn = pci_get_pdn(pdev);
2370	pdn->vfs_expanded = 0;
2371
2372	total_vfs = pci_sriov_get_totalvfs(pdev);
2373	pdn->m64_per_iov = 1;
2374	mul = phb->ioda.total_pe;
2375
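	/*
	 * First pass: pick the expansion factor. By default each VF BAR is
	 * expanded to total_pe copies so that every VF can be given its own
	 * PE; for VF BARs larger than 64MB we switch to the M64-per-IOV
	 * scheme and only expand to the next power of two of total_vfs.
	 */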
2376	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2377		res = &pdev->resource[i + PCI_IOV_RESOURCES];
2378		if (!res->flags || res->parent)
2379			continue;
2380		if (!pnv_pci_is_mem_pref_64(res->flags)) {
2381			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
2382				 i, res);
2383			continue;
2384		}
2385
2386		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
2387
2388		/* bigger than 64M */
2389		if (size > (1 << 26)) {
2390			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
2391				 i, res);
2392			pdn->m64_per_iov = M64_PER_IOV;
2393			mul = roundup_pow_of_two(total_vfs);
2394			break;
2395		}
2396	}
2397
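	/*
	 * Second pass: expand each qualifying (64-bit prefetchable) VF BAR
	 * to "mul" copies of its per-VF size.
	 */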
2398	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2399		res = &pdev->resource[i + PCI_IOV_RESOURCES];
2400		if (!res->flags || res->parent)
2401			continue;
2402		if (!pnv_pci_is_mem_pref_64(res->flags)) {
2403			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
2404				 i, res);
2405			continue;
2406		}
2407
2408		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
2409		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
2410		res->end = res->start + size * mul - 1;
2411		dev_dbg(&pdev->dev, "                       %pR\n", res);
		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)\n",
			 i, res, mul);
2414	}
2415	pdn->vfs_expanded = mul;
2416}
2417#endif /* CONFIG_PCI_IOV */
2418
/*
 * This function is supposed to be called on the PEs from top to
 * bottom, so that the I/O or MMIO segments assigned to a parent PE
 * can be overridden by its child PEs if necessary.
 */
2424static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
2425				  struct pnv_ioda_pe *pe)
2426{
2427	struct pnv_phb *phb = hose->private_data;
2428	struct pci_bus_region region;
2429	struct resource *res;
2430	int i, index;
2431	int rc;
2432
	/*
	 * NOTE: We only care about PCI bus based PEs for now. PCI
	 * device based PEs, for example SR-IOV VFs, will be handled
	 * later.
	 */
2438	BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
2439
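	/*
	 * Walk the bus windows and map every I/O or M32 segment covered by
	 * each window to this PE in the PHB's segment tables.
	 */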
2440	pci_bus_for_each_resource(pe->pbus, res, i) {
2441		if (!res || !res->flags ||
2442		    res->start > res->end)
2443			continue;
2444
2445		if (res->flags & IORESOURCE_IO) {
2446			region.start = res->start - phb->ioda.io_pci_base;
2447			region.end   = res->end - phb->ioda.io_pci_base;
2448			index = region.start / phb->ioda.io_segsize;
2449
2450			while (index < phb->ioda.total_pe &&
2451			       region.start <= region.end) {
2452				phb->ioda.io_segmap[index] = pe->pe_number;
2453				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2454					pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
2455				if (rc != OPAL_SUCCESS) {
2456					pr_err("%s: OPAL error %d when mapping IO "
2457					       "segment #%d to PE#%d\n",
2458					       __func__, rc, index, pe->pe_number);
2459					break;
2460				}
2461
2462				region.start += phb->ioda.io_segsize;
2463				index++;
2464			}
2465		} else if ((res->flags & IORESOURCE_MEM) &&
2466			   !pnv_pci_is_mem_pref_64(res->flags)) {
2467			region.start = res->start -
2468				       hose->mem_offset[0] -
2469				       phb->ioda.m32_pci_base;
2470			region.end   = res->end -
2471				       hose->mem_offset[0] -
2472				       phb->ioda.m32_pci_base;
2473			index = region.start / phb->ioda.m32_segsize;
2474
2475			while (index < phb->ioda.total_pe &&
2476			       region.start <= region.end) {
2477				phb->ioda.m32_segmap[index] = pe->pe_number;
2478				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2479					pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
2480				if (rc != OPAL_SUCCESS) {
					pr_err("%s: OPAL error %d when mapping M32 "
					       "segment #%d to PE#%d\n",
2483					       __func__, rc, index, pe->pe_number);
2484					break;
2485				}
2486
2487				region.start += phb->ioda.m32_segsize;
2488				index++;
2489			}
2490		}
2491	}
2492}
2493
2494static void pnv_pci_ioda_setup_seg(void)
2495{
2496	struct pci_controller *tmp, *hose;
2497	struct pnv_phb *phb;
2498	struct pnv_ioda_pe *pe;
2499
2500	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
2501		phb = hose->private_data;
2502		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2503			pnv_ioda_setup_pe_seg(hose, pe);
2504		}
2505	}
2506}
2507
2508static void pnv_pci_ioda_setup_DMA(void)
2509{
2510	struct pci_controller *hose, *tmp;
2511	struct pnv_phb *phb;
2512
2513	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
2514		pnv_ioda_setup_dma(hose->private_data);
2515
2516		/* Mark the PHB initialization done */
2517		phb = hose->private_data;
2518		phb->initialized = 1;
2519	}
2520}
2521
2522static void pnv_pci_ioda_create_dbgfs(void)
2523{
2524#ifdef CONFIG_DEBUG_FS
2525	struct pci_controller *hose, *tmp;
2526	struct pnv_phb *phb;
2527	char name[16];
2528
2529	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
2530		phb = hose->private_data;
2531
2532		sprintf(name, "PCI%04x", hose->global_number);
2533		phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
2534		if (!phb->dbgfs)
2535			pr_warning("%s: Error on creating debugfs on PHB#%x\n",
2536				__func__, hose->global_number);
2537	}
2538#endif /* CONFIG_DEBUG_FS */
2539}
2540
2541static void pnv_pci_ioda_fixup(void)
2542{
2543	pnv_pci_ioda_setup_PEs();
2544	pnv_pci_ioda_setup_seg();
2545	pnv_pci_ioda_setup_DMA();
2546
2547	pnv_pci_ioda_create_dbgfs();
2548
2549#ifdef CONFIG_EEH
2550	eeh_init();
2551	eeh_addr_cache_build();
2552#endif
2553}
2554
/*
 * Returns the alignment for I/O or memory windows of P2P
 * bridges. That actually depends on how PEs are segmented.
 * For now, we return the I/O or M32 segment size for PE sensitive
 * P2P bridges. Otherwise, the default values (4KiB for I/O,
 * 1MiB for memory) are returned.
 *
 * The current PCI bus might be put into one PE, which was
 * created against the parent PCI bridge. In that case, we
 * needn't enlarge the alignment, which saves some resources.
 */
2567static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
2568						unsigned long type)
2569{
2570	struct pci_dev *bridge;
2571	struct pci_controller *hose = pci_bus_to_host(bus);
2572	struct pnv_phb *phb = hose->private_data;
2573	int num_pci_bridges = 0;
2574
2575	bridge = bus->self;
2576	while (bridge) {
2577		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
2578			num_pci_bridges++;
2579			if (num_pci_bridges >= 2)
2580				return 1;
2581		}
2582
2583		bridge = bridge->bus->self;
2584	}
2585
	/* We fall back to M32 if M64 isn't supported */
2587	if (phb->ioda.m64_segsize &&
2588	    pnv_pci_is_mem_pref_64(type))
2589		return phb->ioda.m64_segsize;
2590	if (type & IORESOURCE_MEM)
2591		return phb->ioda.m32_segsize;
2592
2593	return phb->ioda.io_segsize;
2594}
2595
2596#ifdef CONFIG_PCI_IOV
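/*
 * If the IOV BAR already has a size, that is used as the alignment.
 * Otherwise the BAR was expanded to vfs_expanded copies of the per-VF
 * size in pnv_pci_ioda_fixup_iov_resources(), so report that expanded
 * size here to keep the whole BAR naturally aligned.
 */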
2597static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
2598						      int resno)
2599{
2600	struct pci_dn *pdn = pci_get_pdn(pdev);
2601	resource_size_t align, iov_align;
2602
2603	iov_align = resource_size(&pdev->resource[resno]);
2604	if (iov_align)
2605		return iov_align;
2606
2607	align = pci_iov_resource_size(pdev, resno);
2608	if (pdn->vfs_expanded)
2609		return pdn->vfs_expanded * align;
2610
2611	return align;
2612}
2613#endif /* CONFIG_PCI_IOV */
2614
/* Prevent enabling devices for which we couldn't properly
 * assign a PE.
 */
2618static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
2619{
2620	struct pci_controller *hose = pci_bus_to_host(dev->bus);
2621	struct pnv_phb *phb = hose->private_data;
2622	struct pci_dn *pdn;
2623
	/* The function is probably called while the PEs have
	 * not been created yet, for example during resource
	 * reassignment in the PCI probe period. Just skip the
	 * check if the PEs aren't ready.
	 */
2629	if (!phb->initialized)
2630		return true;
2631
2632	pdn = pci_get_pdn(dev);
2633	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
2634		return false;
2635
2636	return true;
2637}
2638
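/* Look up the PE number for a bus/devfn (RID) via the PHB's reverse map. */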
2639static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
2640			       u32 devfn)
2641{
2642	return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
2643}
2644
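/* Called at shutdown/kexec time to reset the IODA tables to a clean state. */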
2645static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
2646{
2647	opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
2648		       OPAL_ASSERT_RESET);
2649}
2650
2651static void __init pnv_pci_init_ioda_phb(struct device_node *np,
2652					 u64 hub_id, int ioda_type)
2653{
2654	struct pci_controller *hose;
2655	struct pnv_phb *phb;
2656	unsigned long size, m32map_off, pemap_off, iomap_off = 0;
2657	const __be64 *prop64;
2658	const __be32 *prop32;
2659	int len;
2660	u64 phb_id;
2661	void *aux;
2662	long rc;
2663
2664	pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name);
2665
2666	prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
2667	if (!prop64) {
2668		pr_err("  Missing \"ibm,opal-phbid\" property !\n");
2669		return;
2670	}
2671	phb_id = be64_to_cpup(prop64);
2672	pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
2673
2674	phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0);
2675
2676	/* Allocate PCI controller */
2677	phb->hose = hose = pcibios_alloc_controller(np);
2678	if (!phb->hose) {
2679		pr_err("  Can't allocate PCI controller for %s\n",
2680		       np->full_name);
2681		memblock_free(__pa(phb), sizeof(struct pnv_phb));
2682		return;
2683	}
2684
2685	spin_lock_init(&phb->lock);
2686	prop32 = of_get_property(np, "bus-range", &len);
2687	if (prop32 && len == 8) {
2688		hose->first_busno = be32_to_cpu(prop32[0]);
2689		hose->last_busno = be32_to_cpu(prop32[1]);
2690	} else {
2691		pr_warn("  Broken <bus-range> on %s\n", np->full_name);
2692		hose->first_busno = 0;
2693		hose->last_busno = 0xff;
2694	}
2695	hose->private_data = phb;
2696	phb->hub_id = hub_id;
2697	phb->opal_id = phb_id;
2698	phb->type = ioda_type;
2699	mutex_init(&phb->ioda.pe_alloc_mutex);
2700
2701	/* Detect specific models for error handling */
2702	if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
2703		phb->model = PNV_PHB_MODEL_P7IOC;
2704	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
2705		phb->model = PNV_PHB_MODEL_PHB3;
2706	else
2707		phb->model = PNV_PHB_MODEL_UNKNOWN;
2708
2709	/* Parse 32-bit and IO ranges (if any) */
2710	pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
2711
2712	/* Get registers */
2713	phb->regs = of_iomap(np, 0);
2714	if (phb->regs == NULL)
2715		pr_err("  Failed to map registers !\n");
2716
2717	/* Initialize more IODA stuff */
2718	phb->ioda.total_pe = 1;
2719	prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
2720	if (prop32)
2721		phb->ioda.total_pe = be32_to_cpup(prop32);
2722	prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
2723	if (prop32)
2724		phb->ioda.reserved_pe = be32_to_cpup(prop32);
2725
2726	/* Parse 64-bit MMIO range */
2727	pnv_ioda_parse_m64_window(phb);
2728
2729	phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
	/*
	 * FW has already taken the top 64k of M32 space (MSI space) off;
	 * add it back in.
	 */
2731	phb->ioda.m32_size += 0x10000;
2732
2733	phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
2734	phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
2735	phb->ioda.io_size = hose->pci_io_size;
2736	phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
2737	phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
2738
2739	/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
2740	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
2741	m32map_off = size;
2742	size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
2743	if (phb->type == PNV_PHB_IODA1) {
2744		iomap_off = size;
2745		size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]);
2746	}
2747	pemap_off = size;
2748	size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
2749	aux = memblock_virt_alloc(size, 0);
2750	phb->ioda.pe_alloc = aux;
2751	phb->ioda.m32_segmap = aux + m32map_off;
2752	if (phb->type == PNV_PHB_IODA1)
2753		phb->ioda.io_segmap = aux + iomap_off;
2754	phb->ioda.pe_array = aux + pemap_off;
2755	set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc);
2756
2757	INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
2758	INIT_LIST_HEAD(&phb->ioda.pe_list);
2759	mutex_init(&phb->ioda.pe_list_mutex);
2760
	/* Calculate how many 256MB (1 << 28) 32-bit TCE segments we have */
2762	phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
2763
2764#if 0 /* We should really do that ... */
2765	rc = opal_pci_set_phb_mem_window(opal->phb_id,
2766					 window_type,
2767					 window_num,
2768					 starting_real_address,
2769					 starting_pci_address,
2770					 segment_size);
2771#endif
2772
2773	pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
2774		phb->ioda.total_pe, phb->ioda.reserved_pe,
2775		phb->ioda.m32_size, phb->ioda.m32_segsize);
2776	if (phb->ioda.m64_size)
2777		pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
2778			phb->ioda.m64_size, phb->ioda.m64_segsize);
2779	if (phb->ioda.io_size)
2780		pr_info("                  IO: 0x%x [segment=0x%x]\n",
2781			phb->ioda.io_size, phb->ioda.io_segsize);
2782
2783
2784	phb->hose->ops = &pnv_pci_ops;
2785	phb->get_pe_state = pnv_ioda_get_pe_state;
2786	phb->freeze_pe = pnv_ioda_freeze_pe;
2787	phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
2788
2789	/* Setup RID -> PE mapping function */
2790	phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
2791
2792	/* Setup TCEs */
2793	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
2794	phb->dma_set_mask = pnv_pci_ioda_dma_set_mask;
2795	phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask;
2796
2797	/* Setup shutdown function for kexec */
2798	phb->shutdown = pnv_pci_ioda_shutdown;
2799
2800	/* Setup MSI support */
2801	pnv_pci_init_ioda_msis(phb);
2802
	/*
	 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
	 * to let the PCI core do resource assignment. The PCI core
	 * is expected to apply the correct I/O and MMIO alignment
	 * to the P2P bridge BARs so that each PCI bus (excluding
	 * the child P2P bridges) can form an individual PE.
	 */
2810	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
2811	pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook;
2812	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
2813	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
2814	hose->controller_ops = pnv_pci_controller_ops;
2815
2816#ifdef CONFIG_PCI_IOV
2817	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
2818	ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
2819#endif
2820
2821	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
2822
2823	/* Reset IODA tables to a clean state */
2824	rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
2825	if (rc)
2826		pr_warning("  OPAL Error %ld performing IODA table reset !\n", rc);
2827
	/* If we're running in a kdump kernel, the previous kernel never
	 * shut down PCI devices correctly. The IODA tables have already
	 * been cleaned out, so we have to issue a PHB reset to stop all
	 * PCI transactions from the previous kernel.
	 */
2833	if (is_kdump_kernel()) {
2834		pr_info("  Issue PHB reset ...\n");
2835		pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
2836		pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
2837	}
2838
2839	/* Remove M64 resource if we can't configure it successfully */
2840	if (!phb->init_m64 || phb->init_m64(phb))
2841		hose->mem_resources[1].flags = 0;
2842}
2843
2844void __init pnv_pci_init_ioda2_phb(struct device_node *np)
2845{
2846	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
2847}
2848
2849void __init pnv_pci_init_ioda_hub(struct device_node *np)
2850{
2851	struct device_node *phbn;
2852	const __be64 *prop64;
2853	u64 hub_id;
2854
2855	pr_info("Probing IODA IO-Hub %s\n", np->full_name);
2856
2857	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
2858	if (!prop64) {
2859		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
2860		return;
2861	}
2862	hub_id = be64_to_cpup(prop64);
2863	pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
2864
2865	/* Count child PHBs */
2866	for_each_child_of_node(np, phbn) {
2867		/* Look for IODA1 PHBs */
2868		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
2869			pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
2870	}
2871}
2872