/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/vmalloc.h>

#include <asm/e820.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pat.h>

/*
 * The current flushing context - we pass it instead of 5 arguments:
 */
struct cpa_data {
	unsigned long	*vaddr;
	pgd_t		*pgd;
	pgprot_t	mask_set;
	pgprot_t	mask_clr;
	unsigned long	numpages;
	int		flags;
	unsigned long	pfn;
	unsigned	force_split : 1;
	int		curpage;
	struct page	**pages;
};

/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
 * using cpa_lock, so that we don't allow any other CPU with stale large TLB
 * entries to change a page attribute in parallel while some other CPU is
 * splitting a large page entry and changing the attribute.
 */
static DEFINE_SPINLOCK(cpa_lock);

#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4

#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];

void update_page_count(int level, unsigned long pages)
{
	/* Protect against CPA */
	spin_lock(&pgd_lock);
	direct_pages_count[level] += pages;
	spin_unlock(&pgd_lock);
}

static void split_page_count(int level)
{
	direct_pages_count[level]--;
	direct_pages_count[level - 1] += PTRS_PER_PTE;
}

void arch_report_meminfo(struct seq_file *m)
{
	seq_printf(m, "DirectMap4k: %8lu kB\n",
		   direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
	seq_printf(m, "DirectMap2M: %8lu kB\n",
		   direct_pages_count[PG_LEVEL_2M] << 11);
#else
	seq_printf(m, "DirectMap4M: %8lu kB\n",
		   direct_pages_count[PG_LEVEL_2M] << 12);
#endif
	if (direct_gbpages)
		seq_printf(m, "DirectMap1G: %8lu kB\n",
			   direct_pages_count[PG_LEVEL_1G] << 20);
}
#else
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_64

static inline unsigned long highmap_start_pfn(void)
{
	return __pa_symbol(_text) >> PAGE_SHIFT;
}

static inline unsigned long highmap_end_pfn(void)
{
	return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}

#endif

#ifdef CONFIG_DEBUG_PAGEALLOC
# define debug_pagealloc 1
#else
# define debug_pagealloc 0
#endif

static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

/*
 * Flushing functions
 */

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:	virtual start address
 * @size:	number of bytes to flush
 *
 * clflushopt is an unordered instruction which needs fencing with mfence or
 * sfence to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
	unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
	void *vend = vaddr + size;
	void *p;

	mb();

	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
	     p < vend; p += boot_cpu_data.x86_clflush_size)
		clflushopt(p);

	mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);
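/*
 * Illustrative use (not part of this file): a caller that has written
 * data through a cacheable mapping and needs it visible to an uncached
 * alias could write the lines back explicitly; the helper aligns the
 * start down to a cache-line boundary itself:
 *
 *	memcpy(vaddr, data, len);
 *	clflush_cache_range(vaddr, len);
 */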
static void __cpa_flush_all(void *arg)
{
	unsigned long cache = (unsigned long)arg;

	/*
	 * Flush all to work around errata in early Athlons regarding
	 * large page flushing.
	 */
	__flush_tlb_all();

	if (cache && boot_cpu_data.x86 >= 4)
		wbinvd();
}

static void cpa_flush_all(unsigned long cache)
{
	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

static void __cpa_flush_range(void *arg)
{
	/*
	 * We could optimize that further and do individual per page
	 * tlb invalidates for a low number of pages. Caveat: we must
	 * flush the high aliases on 64bit as well.
	 */
	__flush_tlb_all();
}

static void cpa_flush_range(unsigned long start, int numpages, int cache)
{
	unsigned int i, level;
	unsigned long addr;

	BUG_ON(irqs_disabled());
	WARN_ON(PAGE_ALIGN(start) != start);

	on_each_cpu(__cpa_flush_range, NULL, 1);

	if (!cache)
		return;

	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
		pte_t *pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *) addr, PAGE_SIZE);
	}
}

static void cpa_flush_array(unsigned long *start, int numpages, int cache,
			    int in_flags, struct page **pages)
{
	unsigned int i, level;
	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */

	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);

	if (!cache || do_wbinvd)
		return;

	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0; i < numpages; i++) {
		unsigned long addr;
		pte_t *pte;

		if (in_flags & CPA_PAGES_ARRAY)
			addr = (unsigned long)page_address(pages[i]);
		else
			addr = start[i];

		pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *)addr, PAGE_SIZE);
	}
}

/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (for instance, ioremap() on BIOS memory is not uncommon) so this
 * function checks and fixes these known static required protection bits.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
					  unsigned long pfn)
{
	pgprot_t forbidden = __pgprot(0);

	/*
	 * The BIOS area between 640k and 1Mb needs to be executable for
	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
	 */
#ifdef CONFIG_PCI_BIOS
	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_NX;
#endif

	/*
	 * The kernel text needs to be executable for obvious reasons.
	 * Does not cover __inittext, since that is gone later on. On
	 * 64bit we do not enforce !NX on the low mapping.
	 */
	if (within(address, (unsigned long)_text, (unsigned long)_etext))
		pgprot_val(forbidden) |= _PAGE_NX;

	/*
	 * The .rodata section needs to be read-only. Using the pfn
	 * catches all aliases.
	 */
	if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_RW;

#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
	/*
	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
	 * the kernel text mappings for the large-page-aligned .text and
	 * .rodata sections will always be read-only. The kernel identity
	 * mappings covering the holes caused by this alignment can be
	 * anything the user asks for.
	 *
	 * This will preserve the large page mappings for kernel text/data
	 * at no extra cost.
	 */
	if (kernel_set_to_readonly &&
	    within(address, (unsigned long)_text,
		   (unsigned long)__end_rodata_hpage_align)) {
		unsigned int level;

		/*
		 * Don't enforce the !RW mapping for the kernel text mapping,
		 * if the current mapping is already using small page mapping.
		 * No need to work hard to preserve large page mappings in this
		 * case.
		 *
		 * This also fixes the Linux Xen paravirt guest boot failure
		 * caused by unexpected read-only mappings for kernel identity
		 * mappings. In this paravirt guest case, the kernel text
		 * mapping and the kernel identity mapping share the same
		 * page-table pages, so we can't really use different
		 * protections for the kernel text and identity mappings.
		 * Also, these shared mappings are made of small page mappings.
		 * Thus, not enforcing the !RW mapping for small-page kernel
		 * text mappings helps the Linux Xen paravirt guest boot
		 * as well.
		 */
		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
			pgprot_val(forbidden) |= _PAGE_RW;
	}
#endif

	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));

	return prot;
}

/*
 * Lookup the page table entry for a virtual address in a specific pgd.
 * Return a pointer to the entry and the level of the mapping.
 */
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
			     unsigned int *level)
{
	pud_t *pud;
	pmd_t *pmd;

	*level = PG_LEVEL_NONE;

	if (pgd_none(*pgd))
		return NULL;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		return NULL;

	*level = PG_LEVEL_1G;
	if (pud_large(*pud) || !pud_present(*pud))
		return (pte_t *)pud;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return NULL;

	*level = PG_LEVEL_2M;
	if (pmd_large(*pmd) || !pmd_present(*pmd))
		return (pte_t *)pmd;

	*level = PG_LEVEL_4K;

	return pte_offset_kernel(pmd, address);
}

/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexisting mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);
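/*
 * Typical lookup pattern (as used by the flush helpers above): the level
 * output distinguishes a 4K pte from a large pmd/pud entry, and the
 * returned pointer must be checked before use:
 *
 *	unsigned int level;
 *	pte_t *pte = lookup_address(addr, &level);
 *
 *	if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 *		... the entry is mapped; interpret it according to level ...
 */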
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
				  unsigned int *level)
{
	if (cpa->pgd)
		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
					     address, level);

	return lookup_address(address, level);
}

/*
 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
 * or NULL if not present.
 */
pmd_t *lookup_pmd_address(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(address);
	if (pgd_none(*pgd))
		return NULL;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
		return NULL;

	return pmd_offset(pud, address);
}

/*
 * This is necessary because __pa() does not work on some
 * kinds of memory, like vmalloc() or the alloc_remap()
 * areas on 32-bit NUMA systems. The percpu areas can
 * end up in this kind of memory, for instance.
 *
 * This could be optimized, but it is only intended to be
 * used at initialization time, and keeping it
 * unoptimized should increase the testing coverage for
 * the more obscure platforms.
 */
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
	unsigned long virt_addr = (unsigned long)__virt_addr;
	phys_addr_t phys_addr;
	unsigned long offset;
	enum pg_level level;
	pte_t *pte;

	pte = lookup_address(virt_addr, &level);
	BUG_ON(!pte);

	/*
	 * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
	 * before being left-shifted PAGE_SHIFT bits -- this trick is to
	 * make 32-bit PAE kernels work correctly.
	 */
	switch (level) {
	case PG_LEVEL_1G:
		phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
		offset = virt_addr & ~PUD_PAGE_MASK;
		break;
	case PG_LEVEL_2M:
		phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
		offset = virt_addr & ~PMD_PAGE_MASK;
		break;
	default:
		phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
		offset = virt_addr & ~PAGE_MASK;
	}

	return (phys_addr_t)(phys_addr | offset);
}
EXPORT_SYMBOL_GPL(slow_virt_to_phys);
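/*
 * Illustrative contrast (buffer is hypothetical): __pa() is only valid
 * for direct-mapped addresses, so translating e.g. a vmalloc() pointer
 * has to walk the page tables instead:
 *
 *	void *buf = vmalloc(PAGE_SIZE);
 *	phys_addr_t pa = slow_virt_to_phys(buf);
 */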
/*
 * Set the new pmd in all the pgds we know about:
 */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
	/* change init_mm */
	set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
	if (!SHARED_KERNEL_PMD) {
		struct page *page;

		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			pud = pud_offset(pgd, address);
			pmd = pmd_offset(pud, address);
			set_pte_atomic((pte_t *)pmd, pte);
		}
	}
#endif
}

static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
			struct cpa_data *cpa)
{
	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
	pte_t new_pte, old_pte, *tmp;
	pgprot_t old_prot, new_prot, req_prot;
	int i, do_split = 1;
	enum pg_level level;

	if (cpa->force_split)
		return 1;

	spin_lock(&pgd_lock);
	/*
	 * Check for races, another CPU might have split this page
	 * up already:
	 */
	tmp = _lookup_address_cpa(cpa, address, &level);
	if (tmp != kpte)
		goto out_unlock;

	switch (level) {
	case PG_LEVEL_2M:
		old_prot = pmd_pgprot(*(pmd_t *)kpte);
		old_pfn = pmd_pfn(*(pmd_t *)kpte);
		break;
	case PG_LEVEL_1G:
		old_prot = pud_pgprot(*(pud_t *)kpte);
		old_pfn = pud_pfn(*(pud_t *)kpte);
		break;
	default:
		do_split = -EINVAL;
		goto out_unlock;
	}

	psize = page_level_size(level);
	pmask = page_level_mask(level);

	/*
	 * Calculate the number of pages, which fit into this large
	 * page starting at address:
	 */
	nextpage_addr = (address + psize) & pmask;
	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
	if (numpages < cpa->numpages)
		cpa->numpages = numpages;

	/*
	 * We are safe now. Check whether the new pgprot is the same:
	 * Convert protection attributes to 4k-format, as cpa->mask* are set
	 * up accordingly.
	 */
	old_pte = *kpte;
	req_prot = pgprot_large_2_4k(old_prot);

	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);

	/*
	 * req_prot is in the format of 4k pages. It must be converted to
	 * large page format: the caching mode includes the PAT bit located
	 * at different bit positions in the two formats.
	 */
	req_prot = pgprot_4k_2_large(req_prot);

	/*
	 * Set the PSE and GLOBAL flags only if the PRESENT flag is set,
	 * otherwise pmd_present/pmd_huge will return true even on a non
	 * present pmd. canon_pgprot() will clear _PAGE_GLOBAL for the
	 * ancient hardware that doesn't support it.
	 */
	if (pgprot_val(req_prot) & _PAGE_PRESENT)
		pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
	else
		pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);

	req_prot = canon_pgprot(req_prot);

	/*
	 * old_pfn points to the large page base pfn. So we need
	 * to add the offset of the virtual address:
	 */
	pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
	cpa->pfn = pfn;

	new_prot = static_protections(req_prot, address, pfn);

	/*
	 * We need to check the full range, whether
	 * static_protections() requires a different pgprot for one of
	 * the pages in the range we try to preserve:
	 */
	addr = address & pmask;
	pfn = old_pfn;
	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);

		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
			goto out_unlock;
	}

	/*
	 * If there are no changes, return. cpa->numpages has been updated
	 * above:
	 */
	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
		do_split = 0;
		goto out_unlock;
	}

	/*
	 * We need to change the attributes. Check whether we can
	 * change the large page in one go. We request a split when
	 * the address is not aligned and the number of pages is
	 * smaller than the number of pages in the large page. Note
	 * that we limited the number of possible pages already to
	 * the number of pages in the large page.
	 */
	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
		/*
		 * The address is aligned and the number of pages
		 * covers the full page.
		 */
		new_pte = pfn_pte(old_pfn, new_prot);
		__set_pmd_pte(kpte, address, new_pte);
		cpa->flags |= CPA_FLUSHTLB;
		do_split = 0;
	}

out_unlock:
	spin_unlock(&pgd_lock);

	return do_split;
}
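/*
 * Note on the return value above (see the caller in __change_page_attr()):
 * 0 means the large page could be preserved (possibly after being updated
 * in place), a negative errno signals failure, and a positive value tells
 * the caller that the large page must be split.
 */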
static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
		   struct page *base)
{
	pte_t *pbase = (pte_t *)page_address(base);
	unsigned long ref_pfn, pfn, pfninc = 1;
	unsigned int i, level;
	pte_t *tmp;
	pgprot_t ref_prot;

	spin_lock(&pgd_lock);
	/*
	 * Check for races, another CPU might have split this page
	 * up for us already:
	 */
	tmp = _lookup_address_cpa(cpa, address, &level);
	if (tmp != kpte) {
		spin_unlock(&pgd_lock);
		return 1;
	}

	paravirt_alloc_pte(&init_mm, page_to_pfn(base));

	switch (level) {
	case PG_LEVEL_2M:
		ref_prot = pmd_pgprot(*(pmd_t *)kpte);
		/* clear PSE and promote PAT bit to correct position */
		ref_prot = pgprot_large_2_4k(ref_prot);
		ref_pfn = pmd_pfn(*(pmd_t *)kpte);
		break;

	case PG_LEVEL_1G:
		ref_prot = pud_pgprot(*(pud_t *)kpte);
		ref_pfn = pud_pfn(*(pud_t *)kpte);
		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;

		/*
		 * Clear the PSE flag if the PRESENT flag is not set,
		 * otherwise pmd_present/pmd_huge will return true
		 * even on a non present pmd.
		 */
		if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
			pgprot_val(ref_prot) &= ~_PAGE_PSE;
		break;

	default:
		spin_unlock(&pgd_lock);
		return 1;
	}

	/*
	 * Set the GLOBAL flag only if the PRESENT flag is set,
	 * otherwise pmd/pte_present will return true even on a non
	 * present pmd/pte. canon_pgprot() will clear _PAGE_GLOBAL
	 * for the ancient hardware that doesn't support it.
	 */
	if (pgprot_val(ref_prot) & _PAGE_PRESENT)
		pgprot_val(ref_prot) |= _PAGE_GLOBAL;
	else
		pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;

	/*
	 * Get the target pfn from the original entry:
	 */
	pfn = ref_pfn;
	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
		set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));

	if (virt_addr_valid(address)) {
		unsigned long pfn = PFN_DOWN(__pa(address));

		if (pfn_range_is_mapped(pfn, pfn + 1))
			split_page_count(level);
	}

	/*
	 * Install the new, split up pagetable.
	 *
	 * We use the standard kernel pagetable protections for the new
	 * pagetable protections, the actual ptes set above control the
	 * primary protection behavior:
	 */
	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));

	/*
	 * Intel Atom errata AAH41 workaround.
	 *
	 * The real fix should be in hw or in a microcode update, but
	 * we also probabilistically try to reduce the window of having
	 * a large TLB mixed with 4K TLBs while instruction fetches are
	 * going on.
	 */
	__flush_tlb_all();
	spin_unlock(&pgd_lock);

	return 0;
}

static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
			    unsigned long address)
{
	struct page *base;

	if (!debug_pagealloc)
		spin_unlock(&cpa_lock);
	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
	if (!debug_pagealloc)
		spin_lock(&cpa_lock);
	if (!base)
		return -ENOMEM;

	if (__split_large_page(cpa, kpte, address, base))
		__free_page(base);

	return 0;
}
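/*
 * The helpers below tear down mappings created in an alternate PGD and
 * free the intermediate page-table pages once they become empty. They are
 * used by populate_pgd()'s error path and by kernel_unmap_pages_in_pgd()
 * at the end of this file.
 */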
static bool try_to_free_pte_page(pte_t *pte)
{
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++)
		if (!pte_none(pte[i]))
			return false;

	free_page((unsigned long)pte);
	return true;
}

static bool try_to_free_pmd_page(pmd_t *pmd)
{
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++)
		if (!pmd_none(pmd[i]))
			return false;

	free_page((unsigned long)pmd);
	return true;
}

static bool try_to_free_pud_page(pud_t *pud)
{
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++)
		if (!pud_none(pud[i]))
			return false;

	free_page((unsigned long)pud);
	return true;
}

static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
{
	pte_t *pte = pte_offset_kernel(pmd, start);

	while (start < end) {
		set_pte(pte, __pte(0));

		start += PAGE_SIZE;
		pte++;
	}

	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
		pmd_clear(pmd);
		return true;
	}
	return false;
}

static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
			      unsigned long start, unsigned long end)
{
	if (unmap_pte_range(pmd, start, end))
		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
			pud_clear(pud);
}

static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, start);

	/*
	 * Not on a 2MB page boundary?
	 */
	if (start & (PMD_SIZE - 1)) {
		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
		unsigned long pre_end = min_t(unsigned long, end, next_page);

		__unmap_pmd_range(pud, pmd, start, pre_end);

		start = pre_end;
		pmd++;
	}

	/*
	 * Try to unmap in 2M chunks.
	 */
	while (end - start >= PMD_SIZE) {
		if (pmd_large(*pmd))
			pmd_clear(pmd);
		else
			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);

		start += PMD_SIZE;
		pmd++;
	}

	/*
	 * 4K leftovers?
	 */
	if (start < end)
		return __unmap_pmd_range(pud, pmd, start, end);

	/*
	 * Try again to free the PMD page if we haven't succeeded above.
	 */
	if (!pud_none(*pud))
		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
			pud_clear(pud);
}

static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
	pud_t *pud = pud_offset(pgd, start);

	/*
	 * Not on a GB page boundary?
	 */
	if (start & (PUD_SIZE - 1)) {
		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
		unsigned long pre_end = min_t(unsigned long, end, next_page);

		unmap_pmd_range(pud, start, pre_end);

		start = pre_end;
		pud++;
	}

	/*
	 * Try to unmap in 1G chunks.
	 */
	while (end - start >= PUD_SIZE) {

		if (pud_large(*pud))
			pud_clear(pud);
		else
			unmap_pmd_range(pud, start, start + PUD_SIZE);

		start += PUD_SIZE;
		pud++;
	}

	/*
	 * 2M leftovers?
	 */
	if (start < end)
		unmap_pmd_range(pud, start, end);

	/*
	 * No need to try to free the PUD page because we'll free it in
	 * populate_pgd's error path.
	 */
}

static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
{
	pgd_t *pgd_entry = root + pgd_index(addr);

	unmap_pud_range(pgd_entry, addr, end);

	if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
		pgd_clear(pgd_entry);
}

static int alloc_pte_page(pmd_t *pmd)
{
	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
	if (!pte)
		return -1;

	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	return 0;
}

static int alloc_pmd_page(pud_t *pud)
{
	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
	if (!pmd)
		return -1;

	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	return 0;
}

static void populate_pte(struct cpa_data *cpa,
			 unsigned long start, unsigned long end,
			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, start);

	while (num_pages-- && start < end) {

		/* deal with the NX bit */
		if (!(pgprot_val(pgprot) & _PAGE_NX))
			cpa->pfn &= ~_PAGE_NX;

		set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));

		start += PAGE_SIZE;
		cpa->pfn += PAGE_SIZE;
		pte++;
	}
}
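/*
 * Note on the arithmetic in populate_pte() above (and populate_pmd()/
 * populate_pud() below): despite its name, cpa->pfn is treated as a
 * physical address along this populate path -- it is advanced by
 * PAGE_SIZE/PMD_SIZE/PUD_SIZE and shifted right by PAGE_SHIFT when a
 * 4K pte is built.
 */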
static int populate_pmd(struct cpa_data *cpa,
			unsigned long start, unsigned long end,
			unsigned num_pages, pud_t *pud, pgprot_t pgprot)
{
	unsigned int cur_pages = 0;
	pmd_t *pmd;
	pgprot_t pmd_pgprot;

	/*
	 * Not on a 2M boundary?
	 */
	if (start & (PMD_SIZE - 1)) {
		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;

		pre_end = min_t(unsigned long, pre_end, next_page);
		cur_pages = (pre_end - start) >> PAGE_SHIFT;
		cur_pages = min_t(unsigned int, num_pages, cur_pages);

		/*
		 * Need a PTE page?
		 */
		pmd = pmd_offset(pud, start);
		if (pmd_none(*pmd))
			if (alloc_pte_page(pmd))
				return -1;

		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);

		start = pre_end;
	}

	/*
	 * We mapped them all?
	 */
	if (num_pages == cur_pages)
		return cur_pages;

	pmd_pgprot = pgprot_4k_2_large(pgprot);

	while (end - start >= PMD_SIZE) {

		/*
		 * We cannot use a 1G page so allocate a PMD page if needed.
		 */
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		pmd = pmd_offset(pud, start);

		set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
				   massage_pgprot(pmd_pgprot)));

		start += PMD_SIZE;
		cpa->pfn += PMD_SIZE;
		cur_pages += PMD_SIZE >> PAGE_SHIFT;
	}

	/*
	 * Map trailing 4K pages.
	 */
	if (start < end) {
		pmd = pmd_offset(pud, start);
		if (pmd_none(*pmd))
			if (alloc_pte_page(pmd))
				return -1;

		populate_pte(cpa, start, end, num_pages - cur_pages,
			     pmd, pgprot);
	}
	return num_pages;
}

static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
			pgprot_t pgprot)
{
	pud_t *pud;
	unsigned long end;
	int cur_pages = 0;
	pgprot_t pud_pgprot;

	end = start + (cpa->numpages << PAGE_SHIFT);

	/*
	 * Not on a Gb page boundary? => map everything up to it with
	 * smaller pages.
	 */
	if (start & (PUD_SIZE - 1)) {
		unsigned long pre_end;
		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;

		pre_end = min_t(unsigned long, end, next_page);
		cur_pages = (pre_end - start) >> PAGE_SHIFT;
		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);

		pud = pud_offset(pgd, start);

		/*
		 * Need a PMD page?
		 */
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
					 pud, pgprot);
		if (cur_pages < 0)
			return cur_pages;

		start = pre_end;
	}

	/* We mapped them all? */
	if (cpa->numpages == cur_pages)
		return cur_pages;

	pud = pud_offset(pgd, start);
	pud_pgprot = pgprot_4k_2_large(pgprot);

	/*
	 * Map everything starting from the Gb boundary, possibly with 1G pages
	 */
	while (end - start >= PUD_SIZE) {
		set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
				   massage_pgprot(pud_pgprot)));

		start += PUD_SIZE;
		cpa->pfn += PUD_SIZE;
		cur_pages += PUD_SIZE >> PAGE_SHIFT;
		pud++;
	}

	/* Map trailing leftover */
	if (start < end) {
		int tmp;

		pud = pud_offset(pgd, start);
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
				   pud, pgprot);
		if (tmp < 0)
			return cur_pages;

		cur_pages += tmp;
	}
	return cur_pages;
}

/*
 * Restrictions for kernel page tables do not necessarily apply when mapping
 * in an alternate PGD.
 */
static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
	pud_t *pud = NULL;	/* shut up gcc */
	pgd_t *pgd_entry;
	int ret;

	pgd_entry = cpa->pgd + pgd_index(addr);

	/*
	 * Allocate a PUD page and hand it down for mapping.
	 */
	if (pgd_none(*pgd_entry)) {
		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
		if (!pud)
			return -1;

		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
	}

	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
	pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);

	ret = populate_pud(cpa, addr, pgd_entry, pgprot);
	if (ret < 0) {
		unmap_pgd_range(cpa->pgd, addr,
				addr + (cpa->numpages << PAGE_SHIFT));
		return ret;
	}

	cpa->numpages = ret;
	return 0;
}

static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
			       int primary)
{
	if (cpa->pgd)
		return populate_pgd(cpa, vaddr);

	/*
	 * Ignore all non primary paths.
	 */
	if (!primary)
		return 0;

	/*
	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
	 * to have holes.
	 * Also set numpages to '1', indicating that we processed the cpa
	 * request for one virtual address page and its pfn. TBD: numpages
	 * can be set based on the initial value and the level returned by
	 * lookup_address().
	 */
	if (within(vaddr, PAGE_OFFSET,
		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
		cpa->numpages = 1;
		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
		return 0;
	} else {
		WARN(1, KERN_WARNING "CPA: called for zero pte. "
		     "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
		     *cpa->vaddr);

		return -EFAULT;
	}
}

static int __change_page_attr(struct cpa_data *cpa, int primary)
{
	unsigned long address;
	int do_split, err;
	unsigned int level;
	pte_t *kpte, old_pte;

	if (cpa->flags & CPA_PAGES_ARRAY) {
		struct page *page = cpa->pages[cpa->curpage];
		if (unlikely(PageHighMem(page)))
			return 0;
		address = (unsigned long)page_address(page);
	} else if (cpa->flags & CPA_ARRAY)
		address = cpa->vaddr[cpa->curpage];
	else
		address = *cpa->vaddr;
repeat:
	kpte = _lookup_address_cpa(cpa, address, &level);
	if (!kpte)
		return __cpa_process_fault(cpa, address, primary);

	old_pte = *kpte;
	if (!pte_val(old_pte))
		return __cpa_process_fault(cpa, address, primary);

	if (level == PG_LEVEL_4K) {
		pte_t new_pte;
		pgprot_t new_prot = pte_pgprot(old_pte);
		unsigned long pfn = pte_pfn(old_pte);

		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

		new_prot = static_protections(new_prot, address, pfn);

		/*
		 * Set the GLOBAL flag only if the PRESENT flag is set,
		 * otherwise pte_present will return true even on a non
		 * present pte. canon_pgprot() will clear _PAGE_GLOBAL
		 * for the ancient hardware that doesn't support it.
		 */
		if (pgprot_val(new_prot) & _PAGE_PRESENT)
			pgprot_val(new_prot) |= _PAGE_GLOBAL;
		else
			pgprot_val(new_prot) &= ~_PAGE_GLOBAL;

		/*
		 * We need to keep the pfn from the existing PTE,
		 * after all we're only going to change its attributes,
		 * not the memory it points to.
		 */
		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
		cpa->pfn = pfn;
		/*
		 * Do we really change anything?
		 */
		if (pte_val(old_pte) != pte_val(new_pte)) {
			set_pte_atomic(kpte, new_pte);
			cpa->flags |= CPA_FLUSHTLB;
		}
		cpa->numpages = 1;
		return 0;
	}

	/*
	 * Check whether we can keep the large page intact
	 * and just change the pte:
	 */
	do_split = try_preserve_large_page(kpte, address, cpa);
	/*
	 * When the range fits into the existing large page,
	 * return. cpa->numpages and cpa->flags have been updated in
	 * try_preserve_large_page():
	 */
	if (do_split <= 0)
		return do_split;

	/*
	 * We have to split the large page:
	 */
	err = split_large_page(cpa, kpte, address);
	if (!err) {
		/*
		 * Do a global flush tlb after splitting the large page
		 * and before we do the actual change page attribute in the PTE.
		 *
		 * Without this, we violate the TLB application note, which says
		 * "The TLBs may contain both ordinary and large-page
		 * translations for a 4-KByte range of linear addresses. This
		 * may occur if software modifies the paging structures so that
		 * the page size used for the address range changes. If the two
		 * translations differ with respect to page frame or attributes
		 * (e.g., permissions), processor behavior is undefined and may
		 * be implementation-specific."
		 *
		 * We do this global TLB flush inside the cpa_lock, so that we
		 * don't allow any other CPU with stale TLB entries to change,
		 * in parallel, a page attribute that also falls into the
		 * just-split large page entry.
		 */
		flush_tlb_all();
		goto repeat;
	}

	return err;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);

static int cpa_process_alias(struct cpa_data *cpa)
{
	struct cpa_data alias_cpa;
	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
	unsigned long vaddr;
	int ret;

	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
		return 0;

	/*
	 * No need to redo, when the primary call touched the direct
	 * mapping already:
	 */
	if (cpa->flags & CPA_PAGES_ARRAY) {
		struct page *page = cpa->pages[cpa->curpage];
		if (unlikely(PageHighMem(page)))
			return 0;
		vaddr = (unsigned long)page_address(page);
	} else if (cpa->flags & CPA_ARRAY)
		vaddr = cpa->vaddr[cpa->curpage];
	else
		vaddr = *cpa->vaddr;

	if (!(within(vaddr, PAGE_OFFSET,
		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

		alias_cpa = *cpa;
		alias_cpa.vaddr = &laddr;
		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);

		ret = __change_page_attr_set_clr(&alias_cpa, 0);
		if (ret)
			return ret;
	}

#ifdef CONFIG_X86_64
	/*
	 * If the primary call didn't touch the high mapping already
	 * and the physical address is inside the kernel map, we need
	 * to touch the high mapped kernel as well:
	 */
	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
	    within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
					       __START_KERNEL_map - phys_base;
		alias_cpa = *cpa;
		alias_cpa.vaddr = &temp_cpa_vaddr;
		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);

		/*
		 * The high mapping range is imprecise, so ignore the
		 * return value.
		 */
		__change_page_attr_set_clr(&alias_cpa, 0);
	}
#endif

	return 0;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
{
	int ret, numpages = cpa->numpages;

	while (numpages) {
		/*
		 * Store the remaining nr of pages for the large page
		 * preservation check.
		 */
		cpa->numpages = numpages;
		/* for array changes, we can't use large page */
		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
			cpa->numpages = 1;

		if (!debug_pagealloc)
			spin_lock(&cpa_lock);
		ret = __change_page_attr(cpa, checkalias);
		if (!debug_pagealloc)
			spin_unlock(&cpa_lock);
		if (ret)
			return ret;

		if (checkalias) {
			ret = cpa_process_alias(cpa);
			if (ret)
				return ret;
		}

		/*
		 * Adjust the number of pages with the result of the
		 * CPA operation. Either a large page has been
		 * preserved or a single page update happened.
		 */
		BUG_ON(cpa->numpages > numpages || !cpa->numpages);
		numpages -= cpa->numpages;
		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
			cpa->curpage++;
		else
			*cpa->vaddr += cpa->numpages * PAGE_SIZE;

	}
	return 0;
}
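/*
 * change_page_attr_set_clr() below is the common backend for all of the
 * set_memory_*() and set_pages_*() helpers that follow: they differ only
 * in the set/clear masks and flags they pass in.
 */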
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
				    pgprot_t mask_set, pgprot_t mask_clr,
				    int force_split, int in_flag,
				    struct page **pages)
{
	struct cpa_data cpa;
	int ret, cache, checkalias;
	unsigned long baddr = 0;

	memset(&cpa, 0, sizeof(cpa));

	/*
	 * Check if we are requested to change an unsupported feature:
	 */
	mask_set = canon_pgprot(mask_set);
	mask_clr = canon_pgprot(mask_clr);
	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
		return 0;

	/* Ensure we are PAGE_SIZE aligned */
	if (in_flag & CPA_ARRAY) {
		int i;
		for (i = 0; i < numpages; i++) {
			if (addr[i] & ~PAGE_MASK) {
				addr[i] &= PAGE_MASK;
				WARN_ON_ONCE(1);
			}
		}
	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
		/*
		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
		 * No need to check in that case.
		 */
		if (*addr & ~PAGE_MASK) {
			*addr &= PAGE_MASK;
			/*
			 * People should not be passing in unaligned addresses:
			 */
			WARN_ON_ONCE(1);
		}
		/*
		 * Save address for cache flush. *addr is modified in the call
		 * to __change_page_attr_set_clr() below.
		 */
		baddr = *addr;
	}

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();

	vm_unmap_aliases();

	cpa.vaddr = addr;
	cpa.pages = pages;
	cpa.numpages = numpages;
	cpa.mask_set = mask_set;
	cpa.mask_clr = mask_clr;
	cpa.flags = 0;
	cpa.curpage = 0;
	cpa.force_split = force_split;

	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
		cpa.flags |= in_flag;

	/* No alias checking for _NX bit modifications */
	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;

	ret = __change_page_attr_set_clr(&cpa, checkalias);

	/*
	 * Check whether we really changed something:
	 */
	if (!(cpa.flags & CPA_FLUSHTLB))
		goto out;

	/*
	 * No need to flush, when we did not set any of the caching
	 * attributes:
	 */
	cache = !!pgprot2cachemode(mask_set);

	/*
	 * On success, and when the CPU supports it, we use CLFLUSH to
	 * avoid WBINVD. If the CPU does not support CLFLUSH, or in the
	 * error case, we fall back to cpa_flush_all() (which uses WBINVD):
	 */
	if (!ret && cpu_has_clflush) {
		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
			cpa_flush_array(addr, numpages, cache,
					cpa.flags, pages);
		} else
			cpa_flush_range(baddr, numpages, cache);
	} else
		cpa_flush_all(cache);

out:
	return ret;
}

static inline int change_page_attr_set(unsigned long *addr, int numpages,
				       pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int change_page_attr_clear(unsigned long *addr, int numpages,
					 pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int cpa_set_pages_array(struct page **pages, int numpages,
				      pgprot_t mask)
{
	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
		CPA_PAGES_ARRAY, pages);
}

static inline int cpa_clear_pages_array(struct page **pages, int numpages,
					pgprot_t mask)
{
	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
		CPA_PAGES_ARRAY, pages);
}

int _set_memory_uc(unsigned long addr, int numpages)
{
	/*
	 * For now use UC MINUS; see the comments in ioremap_nocache().
	 * If you really need strong UC use ioremap_uc(), but note
	 * that you cannot override IO areas with set_memory_*() as
	 * these helpers cannot work with IO memory.
	 */
	return change_page_attr_set(&addr, numpages,
				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
				    0);
}

int set_memory_uc(unsigned long addr, int numpages)
{
	int ret;

	/*
	 * For now use UC MINUS; see the comments in ioremap_nocache().
	 */
	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			      _PAGE_CACHE_MODE_UC_MINUS, NULL);
	if (ret)
		goto out_err;

	ret = _set_memory_uc(addr, numpages);
	if (ret)
		goto out_free;

	return 0;

out_free:
	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
	return ret;
}
EXPORT_SYMBOL(set_memory_uc);
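/*
 * Illustrative pairing (buffer and size are hypothetical): a caller that
 * switches a range to UC- is expected to switch it back with
 * set_memory_wb(), which also releases the memtype reservation taken
 * here:
 *
 *	set_memory_uc((unsigned long)vaddr, numpages);
 *	... the range is now accessed uncached ...
 *	set_memory_wb((unsigned long)vaddr, numpages);
 */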
static int _set_memory_array(unsigned long *addr, int addrinarray,
		enum page_cache_mode new_type)
{
	enum page_cache_mode set_type;
	int i, j;
	int ret;

	for (i = 0; i < addrinarray; i++) {
		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
					new_type, NULL);
		if (ret)
			goto out_free;
	}

	/* If WC, set to UC- first and then WC */
	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
				_PAGE_CACHE_MODE_UC_MINUS : new_type;

	ret = change_page_attr_set(addr, addrinarray,
				   cachemode2pgprot(set_type), 1);

	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
		ret = change_page_attr_set_clr(addr, addrinarray,
					       cachemode2pgprot(
						_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, CPA_ARRAY, NULL);
	if (ret)
		goto out_free;

	return 0;

out_free:
	for (j = 0; j < i; j++)
		free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);

	return ret;
}

int set_memory_array_uc(unsigned long *addr, int addrinarray)
{
	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_memory_array_uc);

int set_memory_array_wc(unsigned long *addr, int addrinarray)
{
	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_memory_array_wc);

int set_memory_array_wt(unsigned long *addr, int addrinarray)
{
	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
}
EXPORT_SYMBOL_GPL(set_memory_array_wt);

int _set_memory_wc(unsigned long addr, int numpages)
{
	int ret;
	unsigned long addr_copy = addr;

	ret = change_page_attr_set(&addr, numpages,
				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
				   0);
	if (!ret) {
		ret = change_page_attr_set_clr(&addr_copy, numpages,
					       cachemode2pgprot(
						_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, 0, NULL);
	}
	return ret;
}

int set_memory_wc(unsigned long addr, int numpages)
{
	int ret;

	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
		_PAGE_CACHE_MODE_WC, NULL);
	if (ret)
		return ret;

	ret = _set_memory_wc(addr, numpages);
	if (ret)
		free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);

	return ret;
}
EXPORT_SYMBOL(set_memory_wc);

int _set_memory_wt(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages,
				    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
}

int set_memory_wt(unsigned long addr, int numpages)
{
	int ret;

	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			      _PAGE_CACHE_MODE_WT, NULL);
	if (ret)
		return ret;

	ret = _set_memory_wt(addr, numpages);
	if (ret)
		free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);

	return ret;
}
EXPORT_SYMBOL_GPL(set_memory_wt);

int _set_memory_wb(unsigned long addr, int numpages)
{
	/* WB cache mode is hard wired to all cache attribute bits being 0 */
	return change_page_attr_clear(&addr, numpages,
				      __pgprot(_PAGE_CACHE_MASK), 0);
}

int set_memory_wb(unsigned long addr, int numpages)
{
	int ret;

	ret = _set_memory_wb(addr, numpages);
	if (ret)
		return ret;

	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
	return 0;
}
EXPORT_SYMBOL(set_memory_wb);

int set_memory_array_wb(unsigned long *addr, int addrinarray)
{
	int i;
	int ret;

	/* WB cache mode is hard wired to all cache attribute bits being 0 */
	ret = change_page_attr_clear(addr, addrinarray,
				     __pgprot(_PAGE_CACHE_MASK), 1);
	if (ret)
		return ret;

	for (i = 0; i < addrinarray; i++)
		free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);

	return 0;
}
EXPORT_SYMBOL(set_memory_array_wb);
int set_memory_x(unsigned long addr, int numpages)
{
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;

	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);

int set_memory_nx(unsigned long addr, int numpages)
{
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;

	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);

int set_memory_ro(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}

int set_memory_rw(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}

int set_memory_np(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}

int set_memory_4k(unsigned long addr, int numpages)
{
	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
					__pgprot(0), 1, 0, NULL);
}

int set_pages_uc(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_pages_uc);
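/*
 * Unlike the address-based set_memory_*() helpers above, the pages-array
 * variants below may be handed highmem pages: memtype reservation and
 * release are skipped for those, and __change_page_attr() ignores them
 * as well.
 */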
static int _set_pages_array(struct page **pages, int addrinarray,
		enum page_cache_mode new_type)
{
	unsigned long start;
	unsigned long end;
	enum page_cache_mode set_type;
	int i;
	int free_idx;
	int ret;

	for (i = 0; i < addrinarray; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		if (reserve_memtype(start, end, new_type, NULL))
			goto err_out;
	}

	/* If WC, set to UC- first and then WC */
	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
				_PAGE_CACHE_MODE_UC_MINUS : new_type;

	ret = cpa_set_pages_array(pages, addrinarray,
				  cachemode2pgprot(set_type));
	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
		ret = change_page_attr_set_clr(NULL, addrinarray,
					       cachemode2pgprot(
						_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, CPA_PAGES_ARRAY, pages);
	if (ret)
		goto err_out;
	return 0; /* Success */
err_out:
	free_idx = i;
	for (i = 0; i < free_idx; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		free_memtype(start, end);
	}
	return -EINVAL;
}

int set_pages_array_uc(struct page **pages, int addrinarray)
{
	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);

int set_pages_array_wc(struct page **pages, int addrinarray)
{
	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);

int set_pages_array_wt(struct page **pages, int addrinarray)
{
	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
}
EXPORT_SYMBOL_GPL(set_pages_array_wt);

int set_pages_wb(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_pages_wb);

int set_pages_array_wb(struct page **pages, int addrinarray)
{
	int retval;
	unsigned long start;
	unsigned long end;
	int i;

	/* WB cache mode is hard wired to all cache attribute bits being 0 */
	retval = cpa_clear_pages_array(pages, addrinarray,
			__pgprot(_PAGE_CACHE_MASK));
	if (retval)
		return retval;

	for (i = 0; i < addrinarray; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		free_memtype(start, end);
	}

	return 0;
}
EXPORT_SYMBOL(set_pages_array_wb);

int set_pages_x(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_x(addr, numpages);
}
EXPORT_SYMBOL(set_pages_x);

int set_pages_nx(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_nx(addr, numpages);
}
EXPORT_SYMBOL(set_pages_nx);

int set_pages_ro(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_ro(addr, numpages);
}

int set_pages_rw(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_rw(addr, numpages);
}

#ifdef CONFIG_DEBUG_PAGEALLOC

static int __set_pages_p(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.pgd = NULL,
				.numpages = numpages,
				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.mask_clr = __pgprot(0),
				.flags = 0};

	/*
	 * No alias checking needed for setting the present flag; otherwise
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 0);
}

static int __set_pages_np(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.pgd = NULL,
				.numpages = numpages,
				.mask_set = __pgprot(0),
				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.flags = 0};

	/*
	 * No alias checking needed for setting the not-present flag;
	 * otherwise we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 0);
}

void __kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (PageHighMem(page))
		return;
	if (!enable) {
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
	}

	/*
	 * The return value is ignored as the calls cannot fail.
	 * Large pages for identity mappings are not used at boot time
	 * and hence no memory allocations during large page split.
	 */
	if (enable)
		__set_pages_p(page, numpages);
	else
		__set_pages_np(page, numpages);

	/*
	 * We should perform an IPI and flush all TLBs,
	 * but that can deadlock, so flush only the current CPU:
	 */
	__flush_tlb_all();

	arch_flush_lazy_mmu_mode();
}

#ifdef CONFIG_HIBERNATION

bool kernel_page_present(struct page *page)
{
	unsigned int level;
	pte_t *pte;

	if (PageHighMem(page))
		return false;

	pte = lookup_address((unsigned long)page_address(page), &level);
	return (pte_val(*pte) & _PAGE_PRESENT);
}

#endif /* CONFIG_HIBERNATION */

#endif /* CONFIG_DEBUG_PAGEALLOC */

int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
			    unsigned numpages, unsigned long page_flags)
{
	int retval = -EINVAL;

	struct cpa_data cpa = {
		.vaddr = &address,
		.pfn = pfn,
		.pgd = pgd,
		.numpages = numpages,
		.mask_set = __pgprot(0),
		.mask_clr = __pgprot(0),
		.flags = 0,
	};

	if (!(__supported_pte_mask & _PAGE_NX))
		goto out;

	if (!(page_flags & _PAGE_NX))
		cpa.mask_clr = __pgprot(_PAGE_NX);

	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);

	retval = __change_page_attr_set_clr(&cpa, 0);
	__flush_tlb_all();

out:
	return retval;
}

void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
			       unsigned numpages)
{
	unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
}

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "pageattr-test.c"
#endif