1/* 2 * PowerPC64 port by Mike Corrigan and Dave Engebretsen 3 * {mikejc|engebret}@us.ibm.com 4 * 5 * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com> 6 * 7 * SMP scalability work: 8 * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM 9 * 10 * Module name: htab.c 11 * 12 * Description: 13 * PowerPC Hashed Page Table functions 14 * 15 * This program is free software; you can redistribute it and/or 16 * modify it under the terms of the GNU General Public License 17 * as published by the Free Software Foundation; either version 18 * 2 of the License, or (at your option) any later version. 19 */ 20 21#undef DEBUG 22#undef DEBUG_LOW 23 24#include <linux/spinlock.h> 25#include <linux/errno.h> 26#include <linux/sched.h> 27#include <linux/proc_fs.h> 28#include <linux/stat.h> 29#include <linux/sysctl.h> 30#include <linux/export.h> 31#include <linux/ctype.h> 32#include <linux/cache.h> 33#include <linux/init.h> 34#include <linux/signal.h> 35#include <linux/memblock.h> 36#include <linux/context_tracking.h> 37 38#include <asm/processor.h> 39#include <asm/pgtable.h> 40#include <asm/mmu.h> 41#include <asm/mmu_context.h> 42#include <asm/page.h> 43#include <asm/types.h> 44#include <asm/uaccess.h> 45#include <asm/machdep.h> 46#include <asm/prom.h> 47#include <asm/tlbflush.h> 48#include <asm/io.h> 49#include <asm/eeh.h> 50#include <asm/tlb.h> 51#include <asm/cacheflush.h> 52#include <asm/cputable.h> 53#include <asm/sections.h> 54#include <asm/copro.h> 55#include <asm/udbg.h> 56#include <asm/code-patching.h> 57#include <asm/fadump.h> 58#include <asm/firmware.h> 59#include <asm/tm.h> 60 61#ifdef DEBUG 62#define DBG(fmt...) udbg_printf(fmt) 63#else 64#define DBG(fmt...) 65#endif 66 67#ifdef DEBUG_LOW 68#define DBG_LOW(fmt...) udbg_printf(fmt) 69#else 70#define DBG_LOW(fmt...) 71#endif 72 73#define KB (1024) 74#define MB (1024*KB) 75#define GB (1024L*MB) 76 77/* 78 * Note: pte --> Linux PTE 79 * HPTE --> PowerPC Hashed Page Table Entry 80 * 81 * Execution context: 82 * htab_initialize is called with the MMU off (of course), but 83 * the kernel has been copied down to zero so it can directly 84 * reference global data. At this point it is very difficult 85 * to print debug info. 86 * 87 */ 88 89#ifdef CONFIG_U3_DART 90extern unsigned long dart_tablebase; 91#endif /* CONFIG_U3_DART */ 92 93static unsigned long _SDR1; 94struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; 95EXPORT_SYMBOL_GPL(mmu_psize_defs); 96 97struct hash_pte *htab_address; 98unsigned long htab_size_bytes; 99unsigned long htab_hash_mask; 100EXPORT_SYMBOL_GPL(htab_hash_mask); 101int mmu_linear_psize = MMU_PAGE_4K; 102EXPORT_SYMBOL_GPL(mmu_linear_psize); 103int mmu_virtual_psize = MMU_PAGE_4K; 104int mmu_vmalloc_psize = MMU_PAGE_4K; 105#ifdef CONFIG_SPARSEMEM_VMEMMAP 106int mmu_vmemmap_psize = MMU_PAGE_4K; 107#endif 108int mmu_io_psize = MMU_PAGE_4K; 109int mmu_kernel_ssize = MMU_SEGSIZE_256M; 110EXPORT_SYMBOL_GPL(mmu_kernel_ssize); 111int mmu_highuser_ssize = MMU_SEGSIZE_256M; 112u16 mmu_slb_size = 64; 113EXPORT_SYMBOL_GPL(mmu_slb_size); 114#ifdef CONFIG_PPC_64K_PAGES 115int mmu_ci_restrictions; 116#endif 117#ifdef CONFIG_DEBUG_PAGEALLOC 118static u8 *linear_map_hash_slots; 119static unsigned long linear_map_hash_count; 120static DEFINE_SPINLOCK(linear_map_hash_lock); 121#endif /* CONFIG_DEBUG_PAGEALLOC */ 122 123/* There are definitions of page sizes arrays to be used when none 124 * is provided by the firmware. 125 */ 126 127/* Pre-POWER4 CPUs (4k pages only) 128 */ 129static struct mmu_psize_def mmu_psize_defaults_old[] = { 130 [MMU_PAGE_4K] = { 131 .shift = 12, 132 .sllp = 0, 133 .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, 134 .avpnm = 0, 135 .tlbiel = 0, 136 }, 137}; 138 139/* POWER4, GPUL, POWER5 140 * 141 * Support for 16Mb large pages 142 */ 143static struct mmu_psize_def mmu_psize_defaults_gp[] = { 144 [MMU_PAGE_4K] = { 145 .shift = 12, 146 .sllp = 0, 147 .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, 148 .avpnm = 0, 149 .tlbiel = 1, 150 }, 151 [MMU_PAGE_16M] = { 152 .shift = 24, 153 .sllp = SLB_VSID_L, 154 .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, 155 [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, 156 .avpnm = 0x1UL, 157 .tlbiel = 0, 158 }, 159}; 160 161static unsigned long htab_convert_pte_flags(unsigned long pteflags) 162{ 163 unsigned long rflags = pteflags & 0x1fa; 164 165 /* _PAGE_EXEC -> NOEXEC */ 166 if ((pteflags & _PAGE_EXEC) == 0) 167 rflags |= HPTE_R_N; 168 169 /* PP bits. PAGE_USER is already PP bit 0x2, so we only 170 * need to add in 0x1 if it's a read-only user page 171 */ 172 if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) && 173 (pteflags & _PAGE_DIRTY))) 174 rflags |= 1; 175 /* 176 * Always add "C" bit for perf. Memory coherence is always enabled 177 */ 178 return rflags | HPTE_R_C | HPTE_R_M; 179} 180 181int htab_bolt_mapping(unsigned long vstart, unsigned long vend, 182 unsigned long pstart, unsigned long prot, 183 int psize, int ssize) 184{ 185 unsigned long vaddr, paddr; 186 unsigned int step, shift; 187 int ret = 0; 188 189 shift = mmu_psize_defs[psize].shift; 190 step = 1 << shift; 191 192 prot = htab_convert_pte_flags(prot); 193 194 DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", 195 vstart, vend, pstart, prot, psize, ssize); 196 197 for (vaddr = vstart, paddr = pstart; vaddr < vend; 198 vaddr += step, paddr += step) { 199 unsigned long hash, hpteg; 200 unsigned long vsid = get_kernel_vsid(vaddr, ssize); 201 unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); 202 unsigned long tprot = prot; 203 204 /* 205 * If we hit a bad address return error. 206 */ 207 if (!vsid) 208 return -1; 209 /* Make kernel text executable */ 210 if (overlaps_kernel_text(vaddr, vaddr + step)) 211 tprot &= ~HPTE_R_N; 212 213 /* Make kvm guest trampolines executable */ 214 if (overlaps_kvm_tmp(vaddr, vaddr + step)) 215 tprot &= ~HPTE_R_N; 216 217 /* 218 * If relocatable, check if it overlaps interrupt vectors that 219 * are copied down to real 0. For relocatable kernel 220 * (e.g. kdump case) we copy interrupt vectors down to real 221 * address 0. Mark that region as executable. This is 222 * because on p8 system with relocation on exception feature 223 * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence 224 * in order to execute the interrupt handlers in virtual 225 * mode the vector region need to be marked as executable. 226 */ 227 if ((PHYSICAL_START > MEMORY_START) && 228 overlaps_interrupt_vector_text(vaddr, vaddr + step)) 229 tprot &= ~HPTE_R_N; 230 231 hash = hpt_hash(vpn, shift, ssize); 232 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); 233 234 BUG_ON(!ppc_md.hpte_insert); 235 ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot, 236 HPTE_V_BOLTED, psize, psize, ssize); 237 238 if (ret < 0) 239 break; 240#ifdef CONFIG_DEBUG_PAGEALLOC 241 if ((paddr >> PAGE_SHIFT) < linear_map_hash_count) 242 linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; 243#endif /* CONFIG_DEBUG_PAGEALLOC */ 244 } 245 return ret < 0 ? ret : 0; 246} 247 248#ifdef CONFIG_MEMORY_HOTPLUG 249int htab_remove_mapping(unsigned long vstart, unsigned long vend, 250 int psize, int ssize) 251{ 252 unsigned long vaddr; 253 unsigned int step, shift; 254 255 shift = mmu_psize_defs[psize].shift; 256 step = 1 << shift; 257 258 if (!ppc_md.hpte_removebolted) { 259 printk(KERN_WARNING "Platform doesn't implement " 260 "hpte_removebolted\n"); 261 return -EINVAL; 262 } 263 264 for (vaddr = vstart; vaddr < vend; vaddr += step) 265 ppc_md.hpte_removebolted(vaddr, psize, ssize); 266 267 return 0; 268} 269#endif /* CONFIG_MEMORY_HOTPLUG */ 270 271static int __init htab_dt_scan_seg_sizes(unsigned long node, 272 const char *uname, int depth, 273 void *data) 274{ 275 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 276 const __be32 *prop; 277 int size = 0; 278 279 /* We are scanning "cpu" nodes only */ 280 if (type == NULL || strcmp(type, "cpu") != 0) 281 return 0; 282 283 prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); 284 if (prop == NULL) 285 return 0; 286 for (; size >= 4; size -= 4, ++prop) { 287 if (be32_to_cpu(prop[0]) == 40) { 288 DBG("1T segment support detected\n"); 289 cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; 290 return 1; 291 } 292 } 293 cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; 294 return 0; 295} 296 297static void __init htab_init_seg_sizes(void) 298{ 299 of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); 300} 301 302static int __init get_idx_from_shift(unsigned int shift) 303{ 304 int idx = -1; 305 306 switch (shift) { 307 case 0xc: 308 idx = MMU_PAGE_4K; 309 break; 310 case 0x10: 311 idx = MMU_PAGE_64K; 312 break; 313 case 0x14: 314 idx = MMU_PAGE_1M; 315 break; 316 case 0x18: 317 idx = MMU_PAGE_16M; 318 break; 319 case 0x22: 320 idx = MMU_PAGE_16G; 321 break; 322 } 323 return idx; 324} 325 326static int __init htab_dt_scan_page_sizes(unsigned long node, 327 const char *uname, int depth, 328 void *data) 329{ 330 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 331 const __be32 *prop; 332 int size = 0; 333 334 /* We are scanning "cpu" nodes only */ 335 if (type == NULL || strcmp(type, "cpu") != 0) 336 return 0; 337 338 prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); 339 if (!prop) 340 return 0; 341 342 pr_info("Page sizes from device-tree:\n"); 343 size /= 4; 344 cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); 345 while(size > 0) { 346 unsigned int base_shift = be32_to_cpu(prop[0]); 347 unsigned int slbenc = be32_to_cpu(prop[1]); 348 unsigned int lpnum = be32_to_cpu(prop[2]); 349 struct mmu_psize_def *def; 350 int idx, base_idx; 351 352 size -= 3; prop += 3; 353 base_idx = get_idx_from_shift(base_shift); 354 if (base_idx < 0) { 355 /* skip the pte encoding also */ 356 prop += lpnum * 2; size -= lpnum * 2; 357 continue; 358 } 359 def = &mmu_psize_defs[base_idx]; 360 if (base_idx == MMU_PAGE_16M) 361 cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; 362 363 def->shift = base_shift; 364 if (base_shift <= 23) 365 def->avpnm = 0; 366 else 367 def->avpnm = (1 << (base_shift - 23)) - 1; 368 def->sllp = slbenc; 369 /* 370 * We don't know for sure what's up with tlbiel, so 371 * for now we only set it for 4K and 64K pages 372 */ 373 if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) 374 def->tlbiel = 1; 375 else 376 def->tlbiel = 0; 377 378 while (size > 0 && lpnum) { 379 unsigned int shift = be32_to_cpu(prop[0]); 380 int penc = be32_to_cpu(prop[1]); 381 382 prop += 2; size -= 2; 383 lpnum--; 384 385 idx = get_idx_from_shift(shift); 386 if (idx < 0) 387 continue; 388 389 if (penc == -1) 390 pr_err("Invalid penc for base_shift=%d " 391 "shift=%d\n", base_shift, shift); 392 393 def->penc[idx] = penc; 394 pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," 395 " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", 396 base_shift, shift, def->sllp, 397 def->avpnm, def->tlbiel, def->penc[idx]); 398 } 399 } 400 401 return 1; 402} 403 404#ifdef CONFIG_HUGETLB_PAGE 405/* Scan for 16G memory blocks that have been set aside for huge pages 406 * and reserve those blocks for 16G huge pages. 407 */ 408static int __init htab_dt_scan_hugepage_blocks(unsigned long node, 409 const char *uname, int depth, 410 void *data) { 411 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 412 const __be64 *addr_prop; 413 const __be32 *page_count_prop; 414 unsigned int expected_pages; 415 long unsigned int phys_addr; 416 long unsigned int block_size; 417 418 /* We are scanning "memory" nodes only */ 419 if (type == NULL || strcmp(type, "memory") != 0) 420 return 0; 421 422 /* This property is the log base 2 of the number of virtual pages that 423 * will represent this memory block. */ 424 page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); 425 if (page_count_prop == NULL) 426 return 0; 427 expected_pages = (1 << be32_to_cpu(page_count_prop[0])); 428 addr_prop = of_get_flat_dt_prop(node, "reg", NULL); 429 if (addr_prop == NULL) 430 return 0; 431 phys_addr = be64_to_cpu(addr_prop[0]); 432 block_size = be64_to_cpu(addr_prop[1]); 433 if (block_size != (16 * GB)) 434 return 0; 435 printk(KERN_INFO "Huge page(16GB) memory: " 436 "addr = 0x%lX size = 0x%lX pages = %d\n", 437 phys_addr, block_size, expected_pages); 438 if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) { 439 memblock_reserve(phys_addr, block_size * expected_pages); 440 add_gpage(phys_addr, block_size, expected_pages); 441 } 442 return 0; 443} 444#endif /* CONFIG_HUGETLB_PAGE */ 445 446static void mmu_psize_set_default_penc(void) 447{ 448 int bpsize, apsize; 449 for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) 450 for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) 451 mmu_psize_defs[bpsize].penc[apsize] = -1; 452} 453 454#ifdef CONFIG_PPC_64K_PAGES 455 456static bool might_have_hea(void) 457{ 458 /* 459 * The HEA ethernet adapter requires awareness of the 460 * GX bus. Without that awareness we can easily assume 461 * we will never see an HEA ethernet device. 462 */ 463#ifdef CONFIG_IBMEBUS 464 return !cpu_has_feature(CPU_FTR_ARCH_207S); 465#else 466 return false; 467#endif 468} 469 470#endif /* #ifdef CONFIG_PPC_64K_PAGES */ 471 472static void __init htab_init_page_sizes(void) 473{ 474 int rc; 475 476 /* se the invalid penc to -1 */ 477 mmu_psize_set_default_penc(); 478 479 /* Default to 4K pages only */ 480 memcpy(mmu_psize_defs, mmu_psize_defaults_old, 481 sizeof(mmu_psize_defaults_old)); 482 483 /* 484 * Try to find the available page sizes in the device-tree 485 */ 486 rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); 487 if (rc != 0) /* Found */ 488 goto found; 489 490 /* 491 * Not in the device-tree, let's fallback on known size 492 * list for 16M capable GP & GR 493 */ 494 if (mmu_has_feature(MMU_FTR_16M_PAGE)) 495 memcpy(mmu_psize_defs, mmu_psize_defaults_gp, 496 sizeof(mmu_psize_defaults_gp)); 497 found: 498#ifndef CONFIG_DEBUG_PAGEALLOC 499 /* 500 * Pick a size for the linear mapping. Currently, we only support 501 * 16M, 1M and 4K which is the default 502 */ 503 if (mmu_psize_defs[MMU_PAGE_16M].shift) 504 mmu_linear_psize = MMU_PAGE_16M; 505 else if (mmu_psize_defs[MMU_PAGE_1M].shift) 506 mmu_linear_psize = MMU_PAGE_1M; 507#endif /* CONFIG_DEBUG_PAGEALLOC */ 508 509#ifdef CONFIG_PPC_64K_PAGES 510 /* 511 * Pick a size for the ordinary pages. Default is 4K, we support 512 * 64K for user mappings and vmalloc if supported by the processor. 513 * We only use 64k for ioremap if the processor 514 * (and firmware) support cache-inhibited large pages. 515 * If not, we use 4k and set mmu_ci_restrictions so that 516 * hash_page knows to switch processes that use cache-inhibited 517 * mappings to 4k pages. 518 */ 519 if (mmu_psize_defs[MMU_PAGE_64K].shift) { 520 mmu_virtual_psize = MMU_PAGE_64K; 521 mmu_vmalloc_psize = MMU_PAGE_64K; 522 if (mmu_linear_psize == MMU_PAGE_4K) 523 mmu_linear_psize = MMU_PAGE_64K; 524 if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { 525 /* 526 * When running on pSeries using 64k pages for ioremap 527 * would stop us accessing the HEA ethernet. So if we 528 * have the chance of ever seeing one, stay at 4k. 529 */ 530 if (!might_have_hea() || !machine_is(pseries)) 531 mmu_io_psize = MMU_PAGE_64K; 532 } else 533 mmu_ci_restrictions = 1; 534 } 535#endif /* CONFIG_PPC_64K_PAGES */ 536 537#ifdef CONFIG_SPARSEMEM_VMEMMAP 538 /* We try to use 16M pages for vmemmap if that is supported 539 * and we have at least 1G of RAM at boot 540 */ 541 if (mmu_psize_defs[MMU_PAGE_16M].shift && 542 memblock_phys_mem_size() >= 0x40000000) 543 mmu_vmemmap_psize = MMU_PAGE_16M; 544 else if (mmu_psize_defs[MMU_PAGE_64K].shift) 545 mmu_vmemmap_psize = MMU_PAGE_64K; 546 else 547 mmu_vmemmap_psize = MMU_PAGE_4K; 548#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 549 550 printk(KERN_DEBUG "Page orders: linear mapping = %d, " 551 "virtual = %d, io = %d" 552#ifdef CONFIG_SPARSEMEM_VMEMMAP 553 ", vmemmap = %d" 554#endif 555 "\n", 556 mmu_psize_defs[mmu_linear_psize].shift, 557 mmu_psize_defs[mmu_virtual_psize].shift, 558 mmu_psize_defs[mmu_io_psize].shift 559#ifdef CONFIG_SPARSEMEM_VMEMMAP 560 ,mmu_psize_defs[mmu_vmemmap_psize].shift 561#endif 562 ); 563 564#ifdef CONFIG_HUGETLB_PAGE 565 /* Reserve 16G huge page memory sections for huge pages */ 566 of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); 567#endif /* CONFIG_HUGETLB_PAGE */ 568} 569 570static int __init htab_dt_scan_pftsize(unsigned long node, 571 const char *uname, int depth, 572 void *data) 573{ 574 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 575 const __be32 *prop; 576 577 /* We are scanning "cpu" nodes only */ 578 if (type == NULL || strcmp(type, "cpu") != 0) 579 return 0; 580 581 prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); 582 if (prop != NULL) { 583 /* pft_size[0] is the NUMA CEC cookie */ 584 ppc64_pft_size = be32_to_cpu(prop[1]); 585 return 1; 586 } 587 return 0; 588} 589 590static unsigned long __init htab_get_table_size(void) 591{ 592 unsigned long mem_size, rnd_mem_size, pteg_count, psize; 593 594 /* If hash size isn't already provided by the platform, we try to 595 * retrieve it from the device-tree. If it's not there neither, we 596 * calculate it now based on the total RAM size 597 */ 598 if (ppc64_pft_size == 0) 599 of_scan_flat_dt(htab_dt_scan_pftsize, NULL); 600 if (ppc64_pft_size) 601 return 1UL << ppc64_pft_size; 602 603 /* round mem_size up to next power of 2 */ 604 mem_size = memblock_phys_mem_size(); 605 rnd_mem_size = 1UL << __ilog2(mem_size); 606 if (rnd_mem_size < mem_size) 607 rnd_mem_size <<= 1; 608 609 /* # pages / 2 */ 610 psize = mmu_psize_defs[mmu_virtual_psize].shift; 611 pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11); 612 613 return pteg_count << 7; 614} 615 616#ifdef CONFIG_MEMORY_HOTPLUG 617int create_section_mapping(unsigned long start, unsigned long end) 618{ 619 return htab_bolt_mapping(start, end, __pa(start), 620 pgprot_val(PAGE_KERNEL), mmu_linear_psize, 621 mmu_kernel_ssize); 622} 623 624int remove_section_mapping(unsigned long start, unsigned long end) 625{ 626 return htab_remove_mapping(start, end, mmu_linear_psize, 627 mmu_kernel_ssize); 628} 629#endif /* CONFIG_MEMORY_HOTPLUG */ 630 631extern u32 htab_call_hpte_insert1[]; 632extern u32 htab_call_hpte_insert2[]; 633extern u32 htab_call_hpte_remove[]; 634extern u32 htab_call_hpte_updatepp[]; 635extern u32 ht64_call_hpte_insert1[]; 636extern u32 ht64_call_hpte_insert2[]; 637extern u32 ht64_call_hpte_remove[]; 638extern u32 ht64_call_hpte_updatepp[]; 639 640static void __init htab_finish_init(void) 641{ 642#ifdef CONFIG_PPC_HAS_HASH_64K 643 patch_branch(ht64_call_hpte_insert1, 644 ppc_function_entry(ppc_md.hpte_insert), 645 BRANCH_SET_LINK); 646 patch_branch(ht64_call_hpte_insert2, 647 ppc_function_entry(ppc_md.hpte_insert), 648 BRANCH_SET_LINK); 649 patch_branch(ht64_call_hpte_remove, 650 ppc_function_entry(ppc_md.hpte_remove), 651 BRANCH_SET_LINK); 652 patch_branch(ht64_call_hpte_updatepp, 653 ppc_function_entry(ppc_md.hpte_updatepp), 654 BRANCH_SET_LINK); 655#endif /* CONFIG_PPC_HAS_HASH_64K */ 656 657 patch_branch(htab_call_hpte_insert1, 658 ppc_function_entry(ppc_md.hpte_insert), 659 BRANCH_SET_LINK); 660 patch_branch(htab_call_hpte_insert2, 661 ppc_function_entry(ppc_md.hpte_insert), 662 BRANCH_SET_LINK); 663 patch_branch(htab_call_hpte_remove, 664 ppc_function_entry(ppc_md.hpte_remove), 665 BRANCH_SET_LINK); 666 patch_branch(htab_call_hpte_updatepp, 667 ppc_function_entry(ppc_md.hpte_updatepp), 668 BRANCH_SET_LINK); 669} 670 671static void __init htab_initialize(void) 672{ 673 unsigned long table; 674 unsigned long pteg_count; 675 unsigned long prot; 676 unsigned long base = 0, size = 0, limit; 677 struct memblock_region *reg; 678 679 DBG(" -> htab_initialize()\n"); 680 681 /* Initialize segment sizes */ 682 htab_init_seg_sizes(); 683 684 /* Initialize page sizes */ 685 htab_init_page_sizes(); 686 687 if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { 688 mmu_kernel_ssize = MMU_SEGSIZE_1T; 689 mmu_highuser_ssize = MMU_SEGSIZE_1T; 690 printk(KERN_INFO "Using 1TB segments\n"); 691 } 692 693 /* 694 * Calculate the required size of the htab. We want the number of 695 * PTEGs to equal one half the number of real pages. 696 */ 697 htab_size_bytes = htab_get_table_size(); 698 pteg_count = htab_size_bytes >> 7; 699 700 htab_hash_mask = pteg_count - 1; 701 702 if (firmware_has_feature(FW_FEATURE_LPAR)) { 703 /* Using a hypervisor which owns the htab */ 704 htab_address = NULL; 705 _SDR1 = 0; 706#ifdef CONFIG_FA_DUMP 707 /* 708 * If firmware assisted dump is active firmware preserves 709 * the contents of htab along with entire partition memory. 710 * Clear the htab if firmware assisted dump is active so 711 * that we dont end up using old mappings. 712 */ 713 if (is_fadump_active() && ppc_md.hpte_clear_all) 714 ppc_md.hpte_clear_all(); 715#endif 716 } else { 717 /* Find storage for the HPT. Must be contiguous in 718 * the absolute address space. On cell we want it to be 719 * in the first 2 Gig so we can use it for IOMMU hacks. 720 */ 721 if (machine_is(cell)) 722 limit = 0x80000000; 723 else 724 limit = MEMBLOCK_ALLOC_ANYWHERE; 725 726 table = memblock_alloc_base(htab_size_bytes, htab_size_bytes, limit); 727 728 DBG("Hash table allocated at %lx, size: %lx\n", table, 729 htab_size_bytes); 730 731 htab_address = __va(table); 732 733 /* htab absolute addr + encoded htabsize */ 734 _SDR1 = table + __ilog2(pteg_count) - 11; 735 736 /* Initialize the HPT with no entries */ 737 memset((void *)table, 0, htab_size_bytes); 738 739 /* Set SDR1 */ 740 mtspr(SPRN_SDR1, _SDR1); 741 } 742 743 prot = pgprot_val(PAGE_KERNEL); 744 745#ifdef CONFIG_DEBUG_PAGEALLOC 746 linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; 747 linear_map_hash_slots = __va(memblock_alloc_base(linear_map_hash_count, 748 1, ppc64_rma_size)); 749 memset(linear_map_hash_slots, 0, linear_map_hash_count); 750#endif /* CONFIG_DEBUG_PAGEALLOC */ 751 752 /* On U3 based machines, we need to reserve the DART area and 753 * _NOT_ map it to avoid cache paradoxes as it's remapped non 754 * cacheable later on 755 */ 756 757 /* create bolted the linear mapping in the hash table */ 758 for_each_memblock(memory, reg) { 759 base = (unsigned long)__va(reg->base); 760 size = reg->size; 761 762 DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", 763 base, size, prot); 764 765#ifdef CONFIG_U3_DART 766 /* Do not map the DART space. Fortunately, it will be aligned 767 * in such a way that it will not cross two memblock regions and 768 * will fit within a single 16Mb page. 769 * The DART space is assumed to be a full 16Mb region even if 770 * we only use 2Mb of that space. We will use more of it later 771 * for AGP GART. We have to use a full 16Mb large page. 772 */ 773 DBG("DART base: %lx\n", dart_tablebase); 774 775 if (dart_tablebase != 0 && dart_tablebase >= base 776 && dart_tablebase < (base + size)) { 777 unsigned long dart_table_end = dart_tablebase + 16 * MB; 778 if (base != dart_tablebase) 779 BUG_ON(htab_bolt_mapping(base, dart_tablebase, 780 __pa(base), prot, 781 mmu_linear_psize, 782 mmu_kernel_ssize)); 783 if ((base + size) > dart_table_end) 784 BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB, 785 base + size, 786 __pa(dart_table_end), 787 prot, 788 mmu_linear_psize, 789 mmu_kernel_ssize)); 790 continue; 791 } 792#endif /* CONFIG_U3_DART */ 793 BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), 794 prot, mmu_linear_psize, mmu_kernel_ssize)); 795 } 796 memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); 797 798 /* 799 * If we have a memory_limit and we've allocated TCEs then we need to 800 * explicitly map the TCE area at the top of RAM. We also cope with the 801 * case that the TCEs start below memory_limit. 802 * tce_alloc_start/end are 16MB aligned so the mapping should work 803 * for either 4K or 16MB pages. 804 */ 805 if (tce_alloc_start) { 806 tce_alloc_start = (unsigned long)__va(tce_alloc_start); 807 tce_alloc_end = (unsigned long)__va(tce_alloc_end); 808 809 if (base + size >= tce_alloc_start) 810 tce_alloc_start = base + size + 1; 811 812 BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, 813 __pa(tce_alloc_start), prot, 814 mmu_linear_psize, mmu_kernel_ssize)); 815 } 816 817 htab_finish_init(); 818 819 DBG(" <- htab_initialize()\n"); 820} 821#undef KB 822#undef MB 823 824void __init early_init_mmu(void) 825{ 826 /* Initialize the MMU Hash table and create the linear mapping 827 * of memory. Has to be done before SLB initialization as this is 828 * currently where the page size encoding is obtained. 829 */ 830 htab_initialize(); 831 832 /* Initialize SLB management */ 833 slb_initialize(); 834} 835 836#ifdef CONFIG_SMP 837void early_init_mmu_secondary(void) 838{ 839 /* Initialize hash table for that CPU */ 840 if (!firmware_has_feature(FW_FEATURE_LPAR)) 841 mtspr(SPRN_SDR1, _SDR1); 842 843 /* Initialize SLB */ 844 slb_initialize(); 845} 846#endif /* CONFIG_SMP */ 847 848/* 849 * Called by asm hashtable.S for doing lazy icache flush 850 */ 851unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) 852{ 853 struct page *page; 854 855 if (!pfn_valid(pte_pfn(pte))) 856 return pp; 857 858 page = pte_page(pte); 859 860 /* page is dirty */ 861 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { 862 if (trap == 0x400) { 863 flush_dcache_icache_page(page); 864 set_bit(PG_arch_1, &page->flags); 865 } else 866 pp |= HPTE_R_N; 867 } 868 return pp; 869} 870 871#ifdef CONFIG_PPC_MM_SLICES 872static unsigned int get_paca_psize(unsigned long addr) 873{ 874 u64 lpsizes; 875 unsigned char *hpsizes; 876 unsigned long index, mask_index; 877 878 if (addr < SLICE_LOW_TOP) { 879 lpsizes = get_paca()->context.low_slices_psize; 880 index = GET_LOW_SLICE_INDEX(addr); 881 return (lpsizes >> (index * 4)) & 0xF; 882 } 883 hpsizes = get_paca()->context.high_slices_psize; 884 index = GET_HIGH_SLICE_INDEX(addr); 885 mask_index = index & 0x1; 886 return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF; 887} 888 889#else 890unsigned int get_paca_psize(unsigned long addr) 891{ 892 return get_paca()->context.user_psize; 893} 894#endif 895 896/* 897 * Demote a segment to using 4k pages. 898 * For now this makes the whole process use 4k pages. 899 */ 900#ifdef CONFIG_PPC_64K_PAGES 901void demote_segment_4k(struct mm_struct *mm, unsigned long addr) 902{ 903 if (get_slice_psize(mm, addr) == MMU_PAGE_4K) 904 return; 905 slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); 906 copro_flush_all_slbs(mm); 907 if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { 908 get_paca()->context = mm->context; 909 slb_flush_and_rebolt(); 910 } 911} 912#endif /* CONFIG_PPC_64K_PAGES */ 913 914#ifdef CONFIG_PPC_SUBPAGE_PROT 915/* 916 * This looks up a 2-bit protection code for a 4k subpage of a 64k page. 917 * Userspace sets the subpage permissions using the subpage_prot system call. 918 * 919 * Result is 0: full permissions, _PAGE_RW: read-only, 920 * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. 921 */ 922static int subpage_protection(struct mm_struct *mm, unsigned long ea) 923{ 924 struct subpage_prot_table *spt = &mm->context.spt; 925 u32 spp = 0; 926 u32 **sbpm, *sbpp; 927 928 if (ea >= spt->maxaddr) 929 return 0; 930 if (ea < 0x100000000UL) { 931 /* addresses below 4GB use spt->low_prot */ 932 sbpm = spt->low_prot; 933 } else { 934 sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; 935 if (!sbpm) 936 return 0; 937 } 938 sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; 939 if (!sbpp) 940 return 0; 941 spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; 942 943 /* extract 2-bit bitfield for this 4k subpage */ 944 spp >>= 30 - 2 * ((ea >> 12) & 0xf); 945 946 /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */ 947 spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0); 948 return spp; 949} 950 951#else /* CONFIG_PPC_SUBPAGE_PROT */ 952static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) 953{ 954 return 0; 955} 956#endif 957 958void hash_failure_debug(unsigned long ea, unsigned long access, 959 unsigned long vsid, unsigned long trap, 960 int ssize, int psize, int lpsize, unsigned long pte) 961{ 962 if (!printk_ratelimit()) 963 return; 964 pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", 965 ea, access, current->comm); 966 pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", 967 trap, vsid, ssize, psize, lpsize, pte); 968} 969 970static void check_paca_psize(unsigned long ea, struct mm_struct *mm, 971 int psize, bool user_region) 972{ 973 if (user_region) { 974 if (psize != get_paca_psize(ea)) { 975 get_paca()->context = mm->context; 976 slb_flush_and_rebolt(); 977 } 978 } else if (get_paca()->vmalloc_sllp != 979 mmu_psize_defs[mmu_vmalloc_psize].sllp) { 980 get_paca()->vmalloc_sllp = 981 mmu_psize_defs[mmu_vmalloc_psize].sllp; 982 slb_vmalloc_update(); 983 } 984} 985 986/* Result code is: 987 * 0 - handled 988 * 1 - normal page fault 989 * -1 - critical hash insertion error 990 * -2 - access not permitted by subpage protection mechanism 991 */ 992int hash_page_mm(struct mm_struct *mm, unsigned long ea, 993 unsigned long access, unsigned long trap, 994 unsigned long flags) 995{ 996 enum ctx_state prev_state = exception_enter(); 997 pgd_t *pgdir; 998 unsigned long vsid; 999 pte_t *ptep; 1000 unsigned hugeshift; 1001 const struct cpumask *tmp; 1002 int rc, user_region = 0; 1003 int psize, ssize; 1004 1005 DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", 1006 ea, access, trap); 1007 1008 /* Get region & vsid */ 1009 switch (REGION_ID(ea)) { 1010 case USER_REGION_ID: 1011 user_region = 1; 1012 if (! mm) { 1013 DBG_LOW(" user region with no mm !\n"); 1014 rc = 1; 1015 goto bail; 1016 } 1017 psize = get_slice_psize(mm, ea); 1018 ssize = user_segment_size(ea); 1019 vsid = get_vsid(mm->context.id, ea, ssize); 1020 break; 1021 case VMALLOC_REGION_ID: 1022 vsid = get_kernel_vsid(ea, mmu_kernel_ssize); 1023 if (ea < VMALLOC_END) 1024 psize = mmu_vmalloc_psize; 1025 else 1026 psize = mmu_io_psize; 1027 ssize = mmu_kernel_ssize; 1028 break; 1029 default: 1030 /* Not a valid range 1031 * Send the problem up to do_page_fault 1032 */ 1033 rc = 1; 1034 goto bail; 1035 } 1036 DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); 1037 1038 /* Bad address. */ 1039 if (!vsid) { 1040 DBG_LOW("Bad address!\n"); 1041 rc = 1; 1042 goto bail; 1043 } 1044 /* Get pgdir */ 1045 pgdir = mm->pgd; 1046 if (pgdir == NULL) { 1047 rc = 1; 1048 goto bail; 1049 } 1050 1051 /* Check CPU locality */ 1052 tmp = cpumask_of(smp_processor_id()); 1053 if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) 1054 flags |= HPTE_LOCAL_UPDATE; 1055 1056#ifndef CONFIG_PPC_64K_PAGES 1057 /* If we use 4K pages and our psize is not 4K, then we might 1058 * be hitting a special driver mapping, and need to align the 1059 * address before we fetch the PTE. 1060 * 1061 * It could also be a hugepage mapping, in which case this is 1062 * not necessary, but it's not harmful, either. 1063 */ 1064 if (psize != MMU_PAGE_4K) 1065 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); 1066#endif /* CONFIG_PPC_64K_PAGES */ 1067 1068 /* Get PTE and page size from page tables */ 1069 ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); 1070 if (ptep == NULL || !pte_present(*ptep)) { 1071 DBG_LOW(" no PTE !\n"); 1072 rc = 1; 1073 goto bail; 1074 } 1075 1076 /* Add _PAGE_PRESENT to the required access perm */ 1077 access |= _PAGE_PRESENT; 1078 1079 /* Pre-check access permissions (will be re-checked atomically 1080 * in __hash_page_XX but this pre-check is a fast path 1081 */ 1082 if (access & ~pte_val(*ptep)) { 1083 DBG_LOW(" no access !\n"); 1084 rc = 1; 1085 goto bail; 1086 } 1087 1088 if (hugeshift) { 1089 if (pmd_trans_huge(*(pmd_t *)ptep)) 1090 rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, 1091 trap, flags, ssize, psize); 1092#ifdef CONFIG_HUGETLB_PAGE 1093 else 1094 rc = __hash_page_huge(ea, access, vsid, ptep, trap, 1095 flags, ssize, hugeshift, psize); 1096#else 1097 else { 1098 /* 1099 * if we have hugeshift, and is not transhuge with 1100 * hugetlb disabled, something is really wrong. 1101 */ 1102 rc = 1; 1103 WARN_ON(1); 1104 } 1105#endif 1106 if (current->mm == mm) 1107 check_paca_psize(ea, mm, psize, user_region); 1108 1109 goto bail; 1110 } 1111 1112#ifndef CONFIG_PPC_64K_PAGES 1113 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); 1114#else 1115 DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), 1116 pte_val(*(ptep + PTRS_PER_PTE))); 1117#endif 1118 /* Do actual hashing */ 1119#ifdef CONFIG_PPC_64K_PAGES 1120 /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ 1121 if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) { 1122 demote_segment_4k(mm, ea); 1123 psize = MMU_PAGE_4K; 1124 } 1125 1126 /* If this PTE is non-cacheable and we have restrictions on 1127 * using non cacheable large pages, then we switch to 4k 1128 */ 1129 if (mmu_ci_restrictions && psize == MMU_PAGE_64K && 1130 (pte_val(*ptep) & _PAGE_NO_CACHE)) { 1131 if (user_region) { 1132 demote_segment_4k(mm, ea); 1133 psize = MMU_PAGE_4K; 1134 } else if (ea < VMALLOC_END) { 1135 /* 1136 * some driver did a non-cacheable mapping 1137 * in vmalloc space, so switch vmalloc 1138 * to 4k pages 1139 */ 1140 printk(KERN_ALERT "Reducing vmalloc segment " 1141 "to 4kB pages because of " 1142 "non-cacheable mapping\n"); 1143 psize = mmu_vmalloc_psize = MMU_PAGE_4K; 1144 copro_flush_all_slbs(mm); 1145 } 1146 } 1147 1148 if (current->mm == mm) 1149 check_paca_psize(ea, mm, psize, user_region); 1150#endif /* CONFIG_PPC_64K_PAGES */ 1151 1152#ifdef CONFIG_PPC_HAS_HASH_64K 1153 if (psize == MMU_PAGE_64K) 1154 rc = __hash_page_64K(ea, access, vsid, ptep, trap, 1155 flags, ssize); 1156 else 1157#endif /* CONFIG_PPC_HAS_HASH_64K */ 1158 { 1159 int spp = subpage_protection(mm, ea); 1160 if (access & spp) 1161 rc = -2; 1162 else 1163 rc = __hash_page_4K(ea, access, vsid, ptep, trap, 1164 flags, ssize, spp); 1165 } 1166 1167 /* Dump some info in case of hash insertion failure, they should 1168 * never happen so it is really useful to know if/when they do 1169 */ 1170 if (rc == -1) 1171 hash_failure_debug(ea, access, vsid, trap, ssize, psize, 1172 psize, pte_val(*ptep)); 1173#ifndef CONFIG_PPC_64K_PAGES 1174 DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); 1175#else 1176 DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), 1177 pte_val(*(ptep + PTRS_PER_PTE))); 1178#endif 1179 DBG_LOW(" -> rc=%d\n", rc); 1180 1181bail: 1182 exception_exit(prev_state); 1183 return rc; 1184} 1185EXPORT_SYMBOL_GPL(hash_page_mm); 1186 1187int hash_page(unsigned long ea, unsigned long access, unsigned long trap, 1188 unsigned long dsisr) 1189{ 1190 unsigned long flags = 0; 1191 struct mm_struct *mm = current->mm; 1192 1193 if (REGION_ID(ea) == VMALLOC_REGION_ID) 1194 mm = &init_mm; 1195 1196 if (dsisr & DSISR_NOHPTE) 1197 flags |= HPTE_NOHPTE_UPDATE; 1198 1199 return hash_page_mm(mm, ea, access, trap, flags); 1200} 1201EXPORT_SYMBOL_GPL(hash_page); 1202 1203void hash_preload(struct mm_struct *mm, unsigned long ea, 1204 unsigned long access, unsigned long trap) 1205{ 1206 int hugepage_shift; 1207 unsigned long vsid; 1208 pgd_t *pgdir; 1209 pte_t *ptep; 1210 unsigned long flags; 1211 int rc, ssize, update_flags = 0; 1212 1213 BUG_ON(REGION_ID(ea) != USER_REGION_ID); 1214 1215#ifdef CONFIG_PPC_MM_SLICES 1216 /* We only prefault standard pages for now */ 1217 if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) 1218 return; 1219#endif 1220 1221 DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," 1222 " trap=%lx\n", mm, mm->pgd, ea, access, trap); 1223 1224 /* Get Linux PTE if available */ 1225 pgdir = mm->pgd; 1226 if (pgdir == NULL) 1227 return; 1228 1229 /* Get VSID */ 1230 ssize = user_segment_size(ea); 1231 vsid = get_vsid(mm->context.id, ea, ssize); 1232 if (!vsid) 1233 return; 1234 /* 1235 * Hash doesn't like irqs. Walking linux page table with irq disabled 1236 * saves us from holding multiple locks. 1237 */ 1238 local_irq_save(flags); 1239 1240 /* 1241 * THP pages use update_mmu_cache_pmd. We don't do 1242 * hash preload there. Hence can ignore THP here 1243 */ 1244 ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); 1245 if (!ptep) 1246 goto out_exit; 1247 1248 WARN_ON(hugepage_shift); 1249#ifdef CONFIG_PPC_64K_PAGES 1250 /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on 1251 * a 64K kernel), then we don't preload, hash_page() will take 1252 * care of it once we actually try to access the page. 1253 * That way we don't have to duplicate all of the logic for segment 1254 * page size demotion here 1255 */ 1256 if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) 1257 goto out_exit; 1258#endif /* CONFIG_PPC_64K_PAGES */ 1259 1260 /* Is that local to this CPU ? */ 1261 if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) 1262 update_flags |= HPTE_LOCAL_UPDATE; 1263 1264 /* Hash it in */ 1265#ifdef CONFIG_PPC_HAS_HASH_64K 1266 if (mm->context.user_psize == MMU_PAGE_64K) 1267 rc = __hash_page_64K(ea, access, vsid, ptep, trap, 1268 update_flags, ssize); 1269 else 1270#endif /* CONFIG_PPC_HAS_HASH_64K */ 1271 rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, 1272 ssize, subpage_protection(mm, ea)); 1273 1274 /* Dump some info in case of hash insertion failure, they should 1275 * never happen so it is really useful to know if/when they do 1276 */ 1277 if (rc == -1) 1278 hash_failure_debug(ea, access, vsid, trap, ssize, 1279 mm->context.user_psize, 1280 mm->context.user_psize, 1281 pte_val(*ptep)); 1282out_exit: 1283 local_irq_restore(flags); 1284} 1285 1286/* WARNING: This is called from hash_low_64.S, if you change this prototype, 1287 * do not forget to update the assembly call site ! 1288 */ 1289void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, 1290 unsigned long flags) 1291{ 1292 unsigned long hash, index, shift, hidx, slot; 1293 int local = flags & HPTE_LOCAL_UPDATE; 1294 1295 DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); 1296 pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { 1297 hash = hpt_hash(vpn, shift, ssize); 1298 hidx = __rpte_to_hidx(pte, index); 1299 if (hidx & _PTEIDX_SECONDARY) 1300 hash = ~hash; 1301 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1302 slot += hidx & _PTEIDX_GROUP_IX; 1303 DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx); 1304 /* 1305 * We use same base page size and actual psize, because we don't 1306 * use these functions for hugepage 1307 */ 1308 ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local); 1309 } pte_iterate_hashed_end(); 1310 1311#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1312 /* Transactions are not aborted by tlbiel, only tlbie. 1313 * Without, syncing a page back to a block device w/ PIO could pick up 1314 * transactional data (bad!) so we force an abort here. Before the 1315 * sync the page will be made read-only, which will flush_hash_page. 1316 * BIG ISSUE here: if the kernel uses a page from userspace without 1317 * unmapping it first, it may see the speculated version. 1318 */ 1319 if (local && cpu_has_feature(CPU_FTR_TM) && 1320 current->thread.regs && 1321 MSR_TM_ACTIVE(current->thread.regs->msr)) { 1322 tm_enable(); 1323 tm_abort(TM_CAUSE_TLBI); 1324 } 1325#endif 1326} 1327 1328#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1329void flush_hash_hugepage(unsigned long vsid, unsigned long addr, 1330 pmd_t *pmdp, unsigned int psize, int ssize, 1331 unsigned long flags) 1332{ 1333 int i, max_hpte_count, valid; 1334 unsigned long s_addr; 1335 unsigned char *hpte_slot_array; 1336 unsigned long hidx, shift, vpn, hash, slot; 1337 int local = flags & HPTE_LOCAL_UPDATE; 1338 1339 s_addr = addr & HPAGE_PMD_MASK; 1340 hpte_slot_array = get_hpte_slot_array(pmdp); 1341 /* 1342 * IF we try to do a HUGE PTE update after a withdraw is done. 1343 * we will find the below NULL. This happens when we do 1344 * split_huge_page_pmd 1345 */ 1346 if (!hpte_slot_array) 1347 return; 1348 1349 if (ppc_md.hugepage_invalidate) { 1350 ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array, 1351 psize, ssize, local); 1352 goto tm_abort; 1353 } 1354 /* 1355 * No bluk hpte removal support, invalidate each entry 1356 */ 1357 shift = mmu_psize_defs[psize].shift; 1358 max_hpte_count = HPAGE_PMD_SIZE >> shift; 1359 for (i = 0; i < max_hpte_count; i++) { 1360 /* 1361 * 8 bits per each hpte entries 1362 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] 1363 */ 1364 valid = hpte_valid(hpte_slot_array, i); 1365 if (!valid) 1366 continue; 1367 hidx = hpte_hash_index(hpte_slot_array, i); 1368 1369 /* get the vpn */ 1370 addr = s_addr + (i * (1ul << shift)); 1371 vpn = hpt_vpn(addr, vsid, ssize); 1372 hash = hpt_hash(vpn, shift, ssize); 1373 if (hidx & _PTEIDX_SECONDARY) 1374 hash = ~hash; 1375 1376 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1377 slot += hidx & _PTEIDX_GROUP_IX; 1378 ppc_md.hpte_invalidate(slot, vpn, psize, 1379 MMU_PAGE_16M, ssize, local); 1380 } 1381tm_abort: 1382#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1383 /* Transactions are not aborted by tlbiel, only tlbie. 1384 * Without, syncing a page back to a block device w/ PIO could pick up 1385 * transactional data (bad!) so we force an abort here. Before the 1386 * sync the page will be made read-only, which will flush_hash_page. 1387 * BIG ISSUE here: if the kernel uses a page from userspace without 1388 * unmapping it first, it may see the speculated version. 1389 */ 1390 if (local && cpu_has_feature(CPU_FTR_TM) && 1391 current->thread.regs && 1392 MSR_TM_ACTIVE(current->thread.regs->msr)) { 1393 tm_enable(); 1394 tm_abort(TM_CAUSE_TLBI); 1395 } 1396#endif 1397 return; 1398} 1399#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1400 1401void flush_hash_range(unsigned long number, int local) 1402{ 1403 if (ppc_md.flush_hash_range) 1404 ppc_md.flush_hash_range(number, local); 1405 else { 1406 int i; 1407 struct ppc64_tlb_batch *batch = 1408 this_cpu_ptr(&ppc64_tlb_batch); 1409 1410 for (i = 0; i < number; i++) 1411 flush_hash_page(batch->vpn[i], batch->pte[i], 1412 batch->psize, batch->ssize, local); 1413 } 1414} 1415 1416/* 1417 * low_hash_fault is called when we the low level hash code failed 1418 * to instert a PTE due to an hypervisor error 1419 */ 1420void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc) 1421{ 1422 enum ctx_state prev_state = exception_enter(); 1423 1424 if (user_mode(regs)) { 1425#ifdef CONFIG_PPC_SUBPAGE_PROT 1426 if (rc == -2) 1427 _exception(SIGSEGV, regs, SEGV_ACCERR, address); 1428 else 1429#endif 1430 _exception(SIGBUS, regs, BUS_ADRERR, address); 1431 } else 1432 bad_page_fault(regs, address, SIGBUS); 1433 1434 exception_exit(prev_state); 1435} 1436 1437long hpte_insert_repeating(unsigned long hash, unsigned long vpn, 1438 unsigned long pa, unsigned long rflags, 1439 unsigned long vflags, int psize, int ssize) 1440{ 1441 unsigned long hpte_group; 1442 long slot; 1443 1444repeat: 1445 hpte_group = ((hash & htab_hash_mask) * 1446 HPTES_PER_GROUP) & ~0x7UL; 1447 1448 /* Insert into the hash table, primary slot */ 1449 slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags, 1450 psize, psize, ssize); 1451 1452 /* Primary is full, try the secondary */ 1453 if (unlikely(slot == -1)) { 1454 hpte_group = ((~hash & htab_hash_mask) * 1455 HPTES_PER_GROUP) & ~0x7UL; 1456 slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 1457 vflags | HPTE_V_SECONDARY, 1458 psize, psize, ssize); 1459 if (slot == -1) { 1460 if (mftb() & 0x1) 1461 hpte_group = ((hash & htab_hash_mask) * 1462 HPTES_PER_GROUP)&~0x7UL; 1463 1464 ppc_md.hpte_remove(hpte_group); 1465 goto repeat; 1466 } 1467 } 1468 1469 return slot; 1470} 1471 1472#ifdef CONFIG_DEBUG_PAGEALLOC 1473static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) 1474{ 1475 unsigned long hash; 1476 unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); 1477 unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); 1478 unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL); 1479 long ret; 1480 1481 hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); 1482 1483 /* Don't create HPTE entries for bad address */ 1484 if (!vsid) 1485 return; 1486 1487 ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, 1488 HPTE_V_BOLTED, 1489 mmu_linear_psize, mmu_kernel_ssize); 1490 1491 BUG_ON (ret < 0); 1492 spin_lock(&linear_map_hash_lock); 1493 BUG_ON(linear_map_hash_slots[lmi] & 0x80); 1494 linear_map_hash_slots[lmi] = ret | 0x80; 1495 spin_unlock(&linear_map_hash_lock); 1496} 1497 1498static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) 1499{ 1500 unsigned long hash, hidx, slot; 1501 unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); 1502 unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); 1503 1504 hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); 1505 spin_lock(&linear_map_hash_lock); 1506 BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); 1507 hidx = linear_map_hash_slots[lmi] & 0x7f; 1508 linear_map_hash_slots[lmi] = 0; 1509 spin_unlock(&linear_map_hash_lock); 1510 if (hidx & _PTEIDX_SECONDARY) 1511 hash = ~hash; 1512 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1513 slot += hidx & _PTEIDX_GROUP_IX; 1514 ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize, 1515 mmu_kernel_ssize, 0); 1516} 1517 1518void __kernel_map_pages(struct page *page, int numpages, int enable) 1519{ 1520 unsigned long flags, vaddr, lmi; 1521 int i; 1522 1523 local_irq_save(flags); 1524 for (i = 0; i < numpages; i++, page++) { 1525 vaddr = (unsigned long)page_address(page); 1526 lmi = __pa(vaddr) >> PAGE_SHIFT; 1527 if (lmi >= linear_map_hash_count) 1528 continue; 1529 if (enable) 1530 kernel_map_linear_page(vaddr, lmi); 1531 else 1532 kernel_unmap_linear_page(vaddr, lmi); 1533 } 1534 local_irq_restore(flags); 1535} 1536#endif /* CONFIG_DEBUG_PAGEALLOC */ 1537 1538void setup_initial_memory_limit(phys_addr_t first_memblock_base, 1539 phys_addr_t first_memblock_size) 1540{ 1541 /* We don't currently support the first MEMBLOCK not mapping 0 1542 * physical on those processors 1543 */ 1544 BUG_ON(first_memblock_base != 0); 1545 1546 /* On LPAR systems, the first entry is our RMA region, 1547 * non-LPAR 64-bit hash MMU systems don't have a limitation 1548 * on real mode access, but using the first entry works well 1549 * enough. We also clamp it to 1G to avoid some funky things 1550 * such as RTAS bugs etc... 1551 */ 1552 ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); 1553 1554 /* Finally limit subsequent allocations */ 1555 memblock_set_current_limit(ppc64_rma_size); 1556} 1557