/*
 * Handle caching attributes in page tables (PAT)
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 */

#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include <asm/io.h>

#include "pat_internal.h"
#include "mm_internal.h"

#undef pr_fmt
#define pr_fmt(fmt) "" fmt

static bool boot_cpu_done;

static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT);

static inline void pat_disable(const char *reason)
{
	__pat_enabled = 0;
	pr_info("x86/PAT: %s\n", reason);
}

static int __init nopat(char *str)
{
	pat_disable("PAT support disabled.");
	return 0;
}
early_param("nopat", nopat);

bool pat_enabled(void)
{
	return !!__pat_enabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);

int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{
	pat_debug_enable = 1;
	return 0;
}
__setup("debugpat", pat_debug_setup);
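
/*
 * Example: booting with "nopat" on the kernel command line makes
 * pat_enabled() return false for the rest of this file, while "debugpat"
 * turns on the dprintk() messages defined in pat_internal.h.
 */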

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses page flags arch_1 and uncached together to keep track of
 * the memory type of pages that have a backing struct page.
 *
 * X86 PAT supports 4 different memory types:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_WT
 *
 * _PAGE_CACHE_MODE_WB is the default type.
 */

#define _PGMT_WB		0
#define _PGMT_WC		(1UL << PG_arch_1)
#define _PGMT_UC_MINUS		(1UL << PG_uncached)
#define _PGMT_WT		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)

static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	unsigned long pg_flags = pg->flags & _PGMT_MASK;

	if (pg_flags == _PGMT_WB)
		return _PAGE_CACHE_MODE_WB;
	else if (pg_flags == _PGMT_WC)
		return _PAGE_CACHE_MODE_WC;
	else if (pg_flags == _PGMT_UC_MINUS)
		return _PAGE_CACHE_MODE_UC_MINUS;
	else
		return _PAGE_CACHE_MODE_WT;
}

static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
	unsigned long memtype_flags;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_MODE_WC:
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_MODE_UC_MINUS:
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_MODE_WT:
		memtype_flags = _PGMT_WT;
		break;
	case _PAGE_CACHE_MODE_WB:
	default:
		memtype_flags = _PGMT_WB;
		break;
	}

	do {
		old_flags = pg->flags;
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	return -1;
}
static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
}
#endif

enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};

#define CM(c) (_PAGE_CACHE_MODE_ ## c)

static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
{
	enum page_cache_mode cache;
	char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	memcpy(msg, cache_mode, 4);

	return cache;
}

#undef CM

/*
 * Update the cache mode to pgprot translation tables according to PAT
 * configuration.
 * Using lower indices is preferred, so we start with highest index.
 */
void pat_init_cache_modes(u64 pat)
{
	enum page_cache_mode cache;
	char pat_msg[33];
	int i;

	pat_msg[32] = 0;
	for (i = 7; i >= 0; i--) {
		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
					   pat_msg + 4 * i);
		update_cache_mode_entry(i, cache);
	}
	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
}

#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))

static void pat_bsp_init(u64 pat)
{
	u64 tmp_pat;

	if (!cpu_has_pat) {
		pat_disable("PAT not supported by CPU.");
		return;
	}

	if (!pat_enabled())
		goto done;

	rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
	if (!tmp_pat) {
		pat_disable("PAT MSR is 0, disabled.");
		return;
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);

done:
	pat_init_cache_modes(pat);
}

static void pat_ap_init(u64 pat)
{
	if (!pat_enabled())
		return;

	if (!cpu_has_pat) {
		/*
		 * If this happens we are on a secondary CPU, but switched to
		 * PAT on the boot CPU. We have no way to undo PAT.
		 */
		panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);
}

void pat_init(void)
{
	u64 pat;
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (!pat_enabled()) {
		/*
		 * No PAT. Emulate the PAT table that corresponds to the two
		 * cache bits, PWT (Write Through) and PCD (Cache Disable).
		 * This setup is the same as the BIOS default setup when the
		 * system has PAT but the "nopat" boot option has been
		 * specified. This emulated PAT table is used when
		 * MSR_IA32_CR_PAT returns 0.
		 *
		 * PTE encoding:
		 *
		 *       PCD
		 *       |PWT  PAT
		 *       ||    slot
		 *       00    0    WB : _PAGE_CACHE_MODE_WB
		 *       01    1    WT : _PAGE_CACHE_MODE_WT
		 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *       11    3    UC : _PAGE_CACHE_MODE_UC
		 *
		 * NOTE: When WC or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);

	} else if ((c->x86_vendor == X86_VENDOR_INTEL) &&
		   (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
		    ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
		/*
		 * PAT support with the lower four entries. Intel Pentium 2,
		 * 3, M, and 4 are affected by PAT errata, which makes the
		 * upper four entries unusable. To be on the safe side, we
		 * don't use those.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      PAT bit unused
		 *
		 * NOTE: When WT or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
	} else {
		/*
		 * Full PAT support. We put WT in slot 7 to improve
		 * robustness in the presence of errata that might cause
		 * the high PAT bit to be ignored. This way, a buggy slot 7
		 * access will hit slot 3, and slot 3 is UC, so at worst
		 * we lose performance without causing a correctness issue.
		 * Pentium 4 erratum N46 is an example for such an erratum,
		 * although we try not to use PAT at all on affected CPUs.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      100    4    WB : Reserved
		 *      101    5    WC : Reserved
		 *      110    6    UC-: Reserved
		 *      111    7    WT : _PAGE_CACHE_MODE_WT
		 *
		 * The reserved slots are unused, but mapped to their
		 * corresponding types in the presence of PAT errata.
		 */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
	}

	if (!boot_cpu_done) {
		pat_bsp_init(pat);
		boot_cpu_done = true;
	} else {
		pat_ap_init(pat);
	}
}

#undef PAT
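
/*
 * Example: with the full-PAT layout above (slots 0-7 = WB, WC, UC-, UC,
 * WB, WC, UC-, WT), the value written to MSR_IA32_CR_PAT works out to
 * 0x0407010600070106, which pat_init_cache_modes() then decodes back into
 * the "x86/PAT: Configuration [0-7]: ..." boot message.
 */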

static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Does intersection of PAT memory type and MTRR memory type and returns
 * the resulting memory type as PAT understands it.
 * (Type in pat and mtrr will not have same value)
 * The intersection is based on "Effective Memory Type" tables in IA-32
 * SDM vol 3a
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
				     enum page_cache_mode req_type)
{
	/*
	 * Look for MTRR hint to get the effective type in case where PAT
	 * request is for WB.
	 */
	if (req_type == _PAGE_CACHE_MODE_WB) {
		u8 mtrr_type, uniform;

		mtrr_type = mtrr_type_lookup(start, end, &uniform);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_MODE_UC_MINUS;

		return _PAGE_CACHE_MODE_WB;
	}

	return req_type;
}

struct pagerange_state {
	unsigned long	cur_pfn;
	int		ram;
	int		not_ram;
};

static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
	struct pagerange_state *state = arg;

	state->not_ram	|= initial_pfn > state->cur_pfn;
	state->ram	|= total_nr_pages > 0;
	state->cur_pfn	 = initial_pfn + total_nr_pages;

	return state->ram && state->not_ram;
}

static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
	int ret = 0;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct pagerange_state state = {start_pfn, 0, 0};

	/*
	 * For legacy reasons, physical address range in the legacy ISA
	 * region is tracked as non-RAM. This will allow users of
	 * /dev/mem to map portions of legacy ISA region, even when
	 * some of those portions are listed (or not even listed) with
	 * different e820 types (RAM/reserved/..)
	 */
	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;

	if (start_pfn < end_pfn) {
		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
				&state, pagerange_is_ram_callback);
	}

	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}

/*
 * For RAM pages, we use page flags to mark the pages with appropriate type.
 * The page flags are limited to four types, WB (default), WC, WT and UC-.
 * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting
 * a new memory type is only allowed for a page mapped with the default WB
 * type.
 *
 * Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
				  enum page_cache_mode req_type,
				  enum page_cache_mode *new_type)
{
	struct page *page;
	u64 pfn;

	if (req_type == _PAGE_CACHE_MODE_WP) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_UC_MINUS;
		return -EINVAL;
	}

	if (req_type == _PAGE_CACHE_MODE_UC) {
		/* We do not support strong UC */
		WARN_ON_ONCE(1);
		req_type = _PAGE_CACHE_MODE_UC_MINUS;
	}

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		enum page_cache_mode type;

		page = pfn_to_page(pfn);
		type = get_page_memtype(page);
		if (type != _PAGE_CACHE_MODE_WB) {
			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
				start, end - 1, type, req_type);
			if (new_type)
				*new_type = type;

			return -EBUSY;
		}
	}

	if (new_type)
		*new_type = req_type;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, req_type);
	}
	return 0;
}

static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, _PAGE_CACHE_MODE_WB);
	}
	return 0;
}
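
/*
 * Sketch of a typical (hypothetical) caller of the interface below: reserve
 * the physical range, honour the type that was actually granted, and drop
 * the reservation when the mapping goes away.
 *
 *	enum page_cache_mode pcm;
 *	int ret;
 *
 *	ret = reserve_memtype(start, start + size, _PAGE_CACHE_MODE_WC, &pcm);
 *	if (ret)
 *		return ret;
 *	prot = __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) |
 *			cachemode2protval(pcm));
 *	...
 *	free_memtype(start, start + size);
 */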

/*
 * req_type typically has one of the following:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 * - _PAGE_CACHE_MODE_WT
 *
 * If new_type is NULL, function will return an error if it cannot reserve the
 * region with req_type. If new_type is non-NULL, function will return
 * available type in new_type in case of no error. In case of any error
 * it will return a negative return value.
 */
int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
		    enum page_cache_mode *new_type)
{
	struct memtype *new;
	enum page_cache_mode actual_type;
	int is_range_ram;
	int err = 0;

	BUG_ON(start >= end); /* end is exclusive */

	if (!pat_enabled()) {
		/* This is identical to page table setting without PAT */
		if (new_type)
			*new_type = req_type;
		return 0;
	}

	/* Low ISA region is always mapped WB in page table. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end)) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_WB;
		return 0;
	}

	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type);

	if (new_type)
		*new_type = actual_type;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = reserve_ram_pages_type(start, end, req_type, new_type);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->start	= start;
	new->end	= end;
	new->type	= actual_type;

	spin_lock(&memtype_lock);

	err = rbt_memtype_check_insert(new, new_type);
	if (err) {
		pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
			start, end - 1,
			cattr_name(new->type), cattr_name(req_type));
		kfree(new);
		spin_unlock(&memtype_lock);

		return err;
	}

	spin_unlock(&memtype_lock);

	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
		start, end - 1, cattr_name(new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}

int free_memtype(u64 start, u64 end)
{
	int err = -EINVAL;
	int is_range_ram;
	struct memtype *entry;

	if (!pat_enabled())
		return 0;

	/* Low ISA region is always mapped WB. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = free_ram_pages_type(start, end);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	spin_lock(&memtype_lock);
	entry = rbt_memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (!entry) {
		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid, start, end - 1);
		return -EINVAL;
	}

	kfree(entry);

	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);

	return 0;
}


/**
 * lookup_memtype - Looks up the memory type for a physical address
 * @paddr: physical address of which memory type needs to be looked up
 *
 * Only to be called when PAT is enabled
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_WT.
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{
	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
	struct memtype *entry;

	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
		return rettype;

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;

		page = pfn_to_page(paddr >> PAGE_SHIFT);
		return get_page_memtype(page);
	}

	spin_lock(&memtype_lock);

	entry = rbt_memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
		rettype = _PAGE_CACHE_MODE_UC_MINUS;

	spin_unlock(&memtype_lock);
	return rettype;
}

/**
 * io_reserve_memtype - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype, with requested type. On success, requested
 * or any other compatible type that was available for the region is returned
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int io_reserve_memtype(resource_size_t start, resource_size_t end,
			enum page_cache_mode *type)
{
	resource_size_t size = end - start;
	enum page_cache_mode req_type = *type;
	enum page_cache_mode new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, size));

	ret = reserve_memtype(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(start, size, req_type, new_type))
		goto out_free;

	if (kernel_map_sync_memtype(start, size, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	free_memtype(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}

/**
 * io_free_memtype - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void io_free_memtype(resource_size_t start, resource_size_t end)
{
	free_memtype(start, end);
}

pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t vma_prot)
{
	return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled())
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn)) {
			pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
				current->comm, from, to - 1);
			return 0;
		}
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t *vma_prot)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_DSYNC)
		pcm = _PAGE_CACHE_MODE_UC_MINUS;

#ifdef CONFIG_X86_32
	/*
	 * On the PPro and successors, the MTRRs are used to set
	 * memory types for physical addresses outside main memory,
	 * so blindly setting UC or PWT on those pages is wrong.
	 * For Pentiums and earlier, the surround logic should disable
	 * caching for the high addresses through the KEN pin, but
	 * we maintain the tradition of paranoia in this code.
	 */
	if (!pat_enabled() &&
	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
		pcm = _PAGE_CACHE_MODE_UC;
	}
#endif

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     cachemode2protval(pcm));
	return 1;
}

/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space if that range is a part of identity map.
 */
int kernel_map_sync_memtype(u64 base, unsigned long size,
			    enum page_cache_mode pcm)
{
	unsigned long id_sz;

	if (base > __pa(high_memory-1))
		return 0;

	/*
	 * some areas in the middle of the kernel identity range
	 * are not mapped, like the PCI space.
	 */
	if (!page_is_ram(base >> PAGE_SHIFT))
		return 0;

	id_sz = (__pa(high_memory-1) <= base + size) ?
				__pa(high_memory) - base :
				size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid,
			cattr_name(pcm),
			base, (unsigned long long)(base + size-1));
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only; after a successful reserve_memtype, this
 * function also keeps the identity mapping (if any) in sync with the new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
				int strict_prot)
{
	int is_ram = 0;
	int ret;
	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
	enum page_cache_mode pcm = want_pcm;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
	 * track of number of mappings of RAM pages. We can assert that
	 * the type requested matches the type of first page in the range.
	 */
	if (is_ram) {
		if (!pat_enabled())
			return 0;

		pcm = lookup_memtype(paddr);
		if (want_pcm != pcm) {
			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
					      (~_PAGE_CACHE_MASK)) |
					     cachemode2protval(pcm));
		}
		return 0;
	}

	ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
	if (ret)
		return ret;

	if (pcm != want_pcm) {
		if (strict_prot ||
		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
			free_memtype(paddr, paddr + size);
			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
			       current->comm, current->pid,
			       cattr_name(want_pcm),
			       (unsigned long long)paddr,
			       (unsigned long long)(paddr + size - 1),
			       cattr_name(pcm));
			return -EINVAL;
		}
		/*
		 * We allow returning different type than the one requested in
		 * non strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
				     cachemode2protval(pcm));
	}

	if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
		free_memtype(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to free a range of physical memory.
 * Frees non-RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		free_memtype(paddr, paddr + size);
}

/*
 * track_pfn_copy is called when vma that is covering the pfnmap gets
 * copied through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from pte and reserve the entire vma range with single reserve_pfn_range call.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long prot;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	if (vma->vm_flags & VM_PAT) {
		/*
		 * reserve the whole chunk covered by vma. We need the
		 * starting address and protection from pte.
		 */
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		pgprot = __pgprot(prot);
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}

/*
 * prot is passed in as a parameter for the new mapping. If the vma has a
 * linear pfn mapping for the entire range, reserve the entire vma range with
 * a single reserve_pfn_range call.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{
	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
	enum page_cache_mode pcm;

	/* reserve the whole chunk starting from paddr */
	if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
		int ret;

		ret = reserve_pfn_range(paddr, size, prot, 0);
		if (!ret)
			vma->vm_flags |= VM_PAT;
		return ret;
	}

	if (!pat_enabled())
		return 0;

	/*
	 * For anything smaller than the vma size we set prot based on the
	 * lookup.
	 */
	pcm = lookup_memtype(paddr);

	/* Check memtype for the remaining pages */
	while (size > PAGE_SIZE) {
		size -= PAGE_SIZE;
		paddr += PAGE_SIZE;
		if (pcm != lookup_memtype(paddr))
			return -EINVAL;
	}

	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}

int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
		     unsigned long pfn)
{
	enum page_cache_mode pcm;

	if (!pat_enabled())
		return 0;

	/* Set prot based on lookup */
	pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}
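
/*
 * Lifetime of a VM_PAT mapping: track_pfn_remap() above reserves the
 * memtype and sets VM_PAT when a pfnmap covers the whole vma,
 * track_pfn_copy() re-reserves it when the vma is duplicated through
 * copy_page_range(), and untrack_pfn() below drops the reservation and
 * clears VM_PAT when the mapping is torn down.
 */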

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
		 unsigned long size)
{
	resource_size_t paddr;
	unsigned long prot;

	if (!(vma->vm_flags & VM_PAT))
		return;

	/* free the chunk starting from pfn or the whole chunk */
	paddr = (resource_size_t)pfn << PAGE_SHIFT;
	if (!paddr && !size) {
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return;
		}

		size = vma->vm_end - vma->vm_start;
	}
	free_pfn_range(paddr, size);
	vma->vm_flags &= ~VM_PAT;
}

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
			cachemode2protval(_PAGE_CACHE_MODE_WC));
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

pgprot_t pgprot_writethrough(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
			cachemode2protval(_PAGE_CACHE_MODE_WT));
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *print_entry;
	int ret;

	print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!print_entry)
		return NULL;

	spin_lock(&memtype_lock);
	ret = rbt_memtype_copy_nth_element(print_entry, pos);
	spin_unlock(&memtype_lock);

	if (!ret) {
		return print_entry;
	} else {
		kfree(print_entry);
		return NULL;
	}
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_puts(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *print_entry = (struct memtype *)v;

	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
		   print_entry->start, print_entry->end);
	kfree(print_entry);

	return 0;
}

static const struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
	if (pat_enabled()) {
		debugfs_create_file("pat_memtype_list", S_IRUSR,
				    arch_debugfs_dir, NULL, &memtype_fops);
	}
	return 0;
}

late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
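
/*
 * With CONFIG_DEBUG_FS enabled (and debugfs mounted at the usual
 * /sys/kernel/debug), the currently tracked reservations can be inspected
 * from userspace, e.g.:
 *
 *	# cat /sys/kernel/debug/x86/pat_memtype_list
 *
 * Each entry is printed by memtype_seq_show() above in the form
 * "<type> @ 0x<start>-0x<end>".
 */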