1/* 2 * Memory subsystem support 3 * 4 * Written by Matt Tolentino <matthew.e.tolentino@intel.com> 5 * Dave Hansen <haveblue@us.ibm.com> 6 * 7 * This file provides the necessary infrastructure to represent 8 * a SPARSEMEM-memory-model system's physical memory in /sysfs. 9 * All arch-independent code that assumes MEMORY_HOTPLUG requires 10 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 11 */ 12 13#include <linux/module.h> 14#include <linux/init.h> 15#include <linux/topology.h> 16#include <linux/capability.h> 17#include <linux/device.h> 18#include <linux/memory.h> 19#include <linux/memory_hotplug.h> 20#include <linux/mm.h> 21#include <linux/mutex.h> 22#include <linux/stat.h> 23#include <linux/slab.h> 24 25#include <linux/atomic.h> 26#include <asm/uaccess.h> 27 28static DEFINE_MUTEX(mem_sysfs_mutex); 29 30#define MEMORY_CLASS_NAME "memory" 31 32#define to_memory_block(dev) container_of(dev, struct memory_block, dev) 33 34static int sections_per_block; 35 36static inline int base_memory_block_id(int section_nr) 37{ 38 return section_nr / sections_per_block; 39} 40 41static int memory_subsys_online(struct device *dev); 42static int memory_subsys_offline(struct device *dev); 43 44static struct bus_type memory_subsys = { 45 .name = MEMORY_CLASS_NAME, 46 .dev_name = MEMORY_CLASS_NAME, 47 .online = memory_subsys_online, 48 .offline = memory_subsys_offline, 49}; 50 51static BLOCKING_NOTIFIER_HEAD(memory_chain); 52 53int register_memory_notifier(struct notifier_block *nb) 54{ 55 return blocking_notifier_chain_register(&memory_chain, nb); 56} 57EXPORT_SYMBOL(register_memory_notifier); 58 59void unregister_memory_notifier(struct notifier_block *nb) 60{ 61 blocking_notifier_chain_unregister(&memory_chain, nb); 62} 63EXPORT_SYMBOL(unregister_memory_notifier); 64 65static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain); 66 67int register_memory_isolate_notifier(struct notifier_block *nb) 68{ 69 return atomic_notifier_chain_register(&memory_isolate_chain, nb); 70} 71EXPORT_SYMBOL(register_memory_isolate_notifier); 72 73void unregister_memory_isolate_notifier(struct notifier_block *nb) 74{ 75 atomic_notifier_chain_unregister(&memory_isolate_chain, nb); 76} 77EXPORT_SYMBOL(unregister_memory_isolate_notifier); 78 79static void memory_block_release(struct device *dev) 80{ 81 struct memory_block *mem = to_memory_block(dev); 82 83 kfree(mem); 84} 85 86unsigned long __weak memory_block_size_bytes(void) 87{ 88 return MIN_MEMORY_BLOCK_SIZE; 89} 90 91static unsigned long get_memory_block_size(void) 92{ 93 unsigned long block_sz; 94 95 block_sz = memory_block_size_bytes(); 96 97 /* Validate blk_sz is a power of 2 and not less than section size */ 98 if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { 99 WARN_ON(1); 100 block_sz = MIN_MEMORY_BLOCK_SIZE; 101 } 102 103 return block_sz; 104} 105 106/* 107 * use this as the physical section index that this memsection 108 * uses. 109 */ 110 111static ssize_t show_mem_start_phys_index(struct device *dev, 112 struct device_attribute *attr, char *buf) 113{ 114 struct memory_block *mem = to_memory_block(dev); 115 unsigned long phys_index; 116 117 phys_index = mem->start_section_nr / sections_per_block; 118 return sprintf(buf, "%08lx\n", phys_index); 119} 120 121/* 122 * Show whether the section of memory is likely to be hot-removable 123 */ 124static ssize_t show_mem_removable(struct device *dev, 125 struct device_attribute *attr, char *buf) 126{ 127 unsigned long i, pfn; 128 int ret = 1; 129 struct memory_block *mem = to_memory_block(dev); 130 131 for (i = 0; i < sections_per_block; i++) { 132 if (!present_section_nr(mem->start_section_nr + i)) 133 continue; 134 pfn = section_nr_to_pfn(mem->start_section_nr + i); 135 ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); 136 } 137 138 return sprintf(buf, "%d\n", ret); 139} 140 141/* 142 * online, offline, going offline, etc. 143 */ 144static ssize_t show_mem_state(struct device *dev, 145 struct device_attribute *attr, char *buf) 146{ 147 struct memory_block *mem = to_memory_block(dev); 148 ssize_t len = 0; 149 150 /* 151 * We can probably put these states in a nice little array 152 * so that they're not open-coded 153 */ 154 switch (mem->state) { 155 case MEM_ONLINE: 156 len = sprintf(buf, "online\n"); 157 break; 158 case MEM_OFFLINE: 159 len = sprintf(buf, "offline\n"); 160 break; 161 case MEM_GOING_OFFLINE: 162 len = sprintf(buf, "going-offline\n"); 163 break; 164 default: 165 len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", 166 mem->state); 167 WARN_ON(1); 168 break; 169 } 170 171 return len; 172} 173 174int memory_notify(unsigned long val, void *v) 175{ 176 return blocking_notifier_call_chain(&memory_chain, val, v); 177} 178 179int memory_isolate_notify(unsigned long val, void *v) 180{ 181 return atomic_notifier_call_chain(&memory_isolate_chain, val, v); 182} 183 184/* 185 * The probe routines leave the pages reserved, just as the bootmem code does. 186 * Make sure they're still that way. 187 */ 188static bool pages_correctly_reserved(unsigned long start_pfn) 189{ 190 int i, j; 191 struct page *page; 192 unsigned long pfn = start_pfn; 193 194 /* 195 * memmap between sections is not contiguous except with 196 * SPARSEMEM_VMEMMAP. We lookup the page once per section 197 * and assume memmap is contiguous within each section 198 */ 199 for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) { 200 if (WARN_ON_ONCE(!pfn_valid(pfn))) 201 return false; 202 page = pfn_to_page(pfn); 203 204 for (j = 0; j < PAGES_PER_SECTION; j++) { 205 if (PageReserved(page + j)) 206 continue; 207 208 printk(KERN_WARNING "section number %ld page number %d " 209 "not reserved, was it already online?\n", 210 pfn_to_section_nr(pfn), j); 211 212 return false; 213 } 214 } 215 216 return true; 217} 218 219/* 220 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is 221 * OK to have direct references to sparsemem variables in here. 222 * Must already be protected by mem_hotplug_begin(). 223 */ 224static int 225memory_block_action(unsigned long phys_index, unsigned long action, int online_type) 226{ 227 unsigned long start_pfn; 228 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 229 struct page *first_page; 230 int ret; 231 232 start_pfn = section_nr_to_pfn(phys_index); 233 first_page = pfn_to_page(start_pfn); 234 235 switch (action) { 236 case MEM_ONLINE: 237 if (!pages_correctly_reserved(start_pfn)) 238 return -EBUSY; 239 240 ret = online_pages(start_pfn, nr_pages, online_type); 241 break; 242 case MEM_OFFLINE: 243 ret = offline_pages(start_pfn, nr_pages); 244 break; 245 default: 246 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " 247 "%ld\n", __func__, phys_index, action, action); 248 ret = -EINVAL; 249 } 250 251 return ret; 252} 253 254static int memory_block_change_state(struct memory_block *mem, 255 unsigned long to_state, unsigned long from_state_req) 256{ 257 int ret = 0; 258 259 if (mem->state != from_state_req) 260 return -EINVAL; 261 262 if (to_state == MEM_OFFLINE) 263 mem->state = MEM_GOING_OFFLINE; 264 265 ret = memory_block_action(mem->start_section_nr, to_state, 266 mem->online_type); 267 268 mem->state = ret ? from_state_req : to_state; 269 270 return ret; 271} 272 273/* The device lock serializes operations on memory_subsys_[online|offline] */ 274static int memory_subsys_online(struct device *dev) 275{ 276 struct memory_block *mem = to_memory_block(dev); 277 int ret; 278 279 if (mem->state == MEM_ONLINE) 280 return 0; 281 282 /* 283 * If we are called from store_mem_state(), online_type will be 284 * set >= 0 Otherwise we were called from the device online 285 * attribute and need to set the online_type. 286 */ 287 if (mem->online_type < 0) 288 mem->online_type = MMOP_ONLINE_KEEP; 289 290 /* Already under protection of mem_hotplug_begin() */ 291 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 292 293 /* clear online_type */ 294 mem->online_type = -1; 295 296 return ret; 297} 298 299static int memory_subsys_offline(struct device *dev) 300{ 301 struct memory_block *mem = to_memory_block(dev); 302 303 if (mem->state == MEM_OFFLINE) 304 return 0; 305 306 /* Can't offline block with non-present sections */ 307 if (mem->section_count != sections_per_block) 308 return -EINVAL; 309 310 return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 311} 312 313static ssize_t 314store_mem_state(struct device *dev, 315 struct device_attribute *attr, const char *buf, size_t count) 316{ 317 struct memory_block *mem = to_memory_block(dev); 318 int ret, online_type; 319 320 ret = lock_device_hotplug_sysfs(); 321 if (ret) 322 return ret; 323 324 if (sysfs_streq(buf, "online_kernel")) 325 online_type = MMOP_ONLINE_KERNEL; 326 else if (sysfs_streq(buf, "online_movable")) 327 online_type = MMOP_ONLINE_MOVABLE; 328 else if (sysfs_streq(buf, "online")) 329 online_type = MMOP_ONLINE_KEEP; 330 else if (sysfs_streq(buf, "offline")) 331 online_type = MMOP_OFFLINE; 332 else { 333 ret = -EINVAL; 334 goto err; 335 } 336 337 /* 338 * Memory hotplug needs to hold mem_hotplug_begin() for probe to find 339 * the correct memory block to online before doing device_online(dev), 340 * which will take dev->mutex. Take the lock early to prevent an 341 * inversion, memory_subsys_online() callbacks will be implemented by 342 * assuming it's already protected. 343 */ 344 mem_hotplug_begin(); 345 346 switch (online_type) { 347 case MMOP_ONLINE_KERNEL: 348 case MMOP_ONLINE_MOVABLE: 349 case MMOP_ONLINE_KEEP: 350 mem->online_type = online_type; 351 ret = device_online(&mem->dev); 352 break; 353 case MMOP_OFFLINE: 354 ret = device_offline(&mem->dev); 355 break; 356 default: 357 ret = -EINVAL; /* should never happen */ 358 } 359 360 mem_hotplug_done(); 361err: 362 unlock_device_hotplug(); 363 364 if (ret) 365 return ret; 366 return count; 367} 368 369/* 370 * phys_device is a bad name for this. What I really want 371 * is a way to differentiate between memory ranges that 372 * are part of physical devices that constitute 373 * a complete removable unit or fru. 374 * i.e. do these ranges belong to the same physical device, 375 * s.t. if I offline all of these sections I can then 376 * remove the physical device? 377 */ 378static ssize_t show_phys_device(struct device *dev, 379 struct device_attribute *attr, char *buf) 380{ 381 struct memory_block *mem = to_memory_block(dev); 382 return sprintf(buf, "%d\n", mem->phys_device); 383} 384 385#ifdef CONFIG_MEMORY_HOTREMOVE 386static ssize_t show_valid_zones(struct device *dev, 387 struct device_attribute *attr, char *buf) 388{ 389 struct memory_block *mem = to_memory_block(dev); 390 unsigned long start_pfn, end_pfn; 391 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 392 struct page *first_page; 393 struct zone *zone; 394 395 start_pfn = section_nr_to_pfn(mem->start_section_nr); 396 end_pfn = start_pfn + nr_pages; 397 first_page = pfn_to_page(start_pfn); 398 399 /* The block contains more than one zone can not be offlined. */ 400 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 401 return sprintf(buf, "none\n"); 402 403 zone = page_zone(first_page); 404 405 if (zone_idx(zone) == ZONE_MOVABLE - 1) { 406 /*The mem block is the last memoryblock of this zone.*/ 407 if (end_pfn == zone_end_pfn(zone)) 408 return sprintf(buf, "%s %s\n", 409 zone->name, (zone + 1)->name); 410 } 411 412 if (zone_idx(zone) == ZONE_MOVABLE) { 413 /*The mem block is the first memoryblock of ZONE_MOVABLE.*/ 414 if (start_pfn == zone->zone_start_pfn) 415 return sprintf(buf, "%s %s\n", 416 zone->name, (zone - 1)->name); 417 } 418 419 return sprintf(buf, "%s\n", zone->name); 420} 421static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL); 422#endif 423 424static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); 425static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); 426static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); 427static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL); 428 429/* 430 * Block size attribute stuff 431 */ 432static ssize_t 433print_block_size(struct device *dev, struct device_attribute *attr, 434 char *buf) 435{ 436 return sprintf(buf, "%lx\n", get_memory_block_size()); 437} 438 439static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL); 440 441/* 442 * Some architectures will have custom drivers to do this, and 443 * will not need to do it from userspace. The fake hot-add code 444 * as well as ppc64 will do all of their discovery in userspace 445 * and will require this interface. 446 */ 447#ifdef CONFIG_ARCH_MEMORY_PROBE 448static ssize_t 449memory_probe_store(struct device *dev, struct device_attribute *attr, 450 const char *buf, size_t count) 451{ 452 u64 phys_addr; 453 int nid; 454 int i, ret; 455 unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; 456 457 ret = kstrtoull(buf, 0, &phys_addr); 458 if (ret) 459 return ret; 460 461 if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) 462 return -EINVAL; 463 464 for (i = 0; i < sections_per_block; i++) { 465 nid = memory_add_physaddr_to_nid(phys_addr); 466 ret = add_memory(nid, phys_addr, 467 PAGES_PER_SECTION << PAGE_SHIFT); 468 if (ret) 469 goto out; 470 471 phys_addr += MIN_MEMORY_BLOCK_SIZE; 472 } 473 474 ret = count; 475out: 476 return ret; 477} 478 479static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store); 480#endif 481 482#ifdef CONFIG_MEMORY_FAILURE 483/* 484 * Support for offlining pages of memory 485 */ 486 487/* Soft offline a page */ 488static ssize_t 489store_soft_offline_page(struct device *dev, 490 struct device_attribute *attr, 491 const char *buf, size_t count) 492{ 493 int ret; 494 u64 pfn; 495 if (!capable(CAP_SYS_ADMIN)) 496 return -EPERM; 497 if (kstrtoull(buf, 0, &pfn) < 0) 498 return -EINVAL; 499 pfn >>= PAGE_SHIFT; 500 if (!pfn_valid(pfn)) 501 return -ENXIO; 502 ret = soft_offline_page(pfn_to_page(pfn), 0); 503 return ret == 0 ? count : ret; 504} 505 506/* Forcibly offline a page, including killing processes. */ 507static ssize_t 508store_hard_offline_page(struct device *dev, 509 struct device_attribute *attr, 510 const char *buf, size_t count) 511{ 512 int ret; 513 u64 pfn; 514 if (!capable(CAP_SYS_ADMIN)) 515 return -EPERM; 516 if (kstrtoull(buf, 0, &pfn) < 0) 517 return -EINVAL; 518 pfn >>= PAGE_SHIFT; 519 ret = memory_failure(pfn, 0, 0); 520 return ret ? ret : count; 521} 522 523static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page); 524static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page); 525#endif 526 527/* 528 * Note that phys_device is optional. It is here to allow for 529 * differentiation between which *physical* devices each 530 * section belongs to... 531 */ 532int __weak arch_get_memory_phys_device(unsigned long start_pfn) 533{ 534 return 0; 535} 536 537/* 538 * A reference for the returned object is held and the reference for the 539 * hinted object is released. 540 */ 541struct memory_block *find_memory_block_hinted(struct mem_section *section, 542 struct memory_block *hint) 543{ 544 int block_id = base_memory_block_id(__section_nr(section)); 545 struct device *hintdev = hint ? &hint->dev : NULL; 546 struct device *dev; 547 548 dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev); 549 if (hint) 550 put_device(&hint->dev); 551 if (!dev) 552 return NULL; 553 return to_memory_block(dev); 554} 555 556/* 557 * For now, we have a linear search to go find the appropriate 558 * memory_block corresponding to a particular phys_index. If 559 * this gets to be a real problem, we can always use a radix 560 * tree or something here. 561 * 562 * This could be made generic for all device subsystems. 563 */ 564struct memory_block *find_memory_block(struct mem_section *section) 565{ 566 return find_memory_block_hinted(section, NULL); 567} 568 569static struct attribute *memory_memblk_attrs[] = { 570 &dev_attr_phys_index.attr, 571 &dev_attr_state.attr, 572 &dev_attr_phys_device.attr, 573 &dev_attr_removable.attr, 574#ifdef CONFIG_MEMORY_HOTREMOVE 575 &dev_attr_valid_zones.attr, 576#endif 577 NULL 578}; 579 580static struct attribute_group memory_memblk_attr_group = { 581 .attrs = memory_memblk_attrs, 582}; 583 584static const struct attribute_group *memory_memblk_attr_groups[] = { 585 &memory_memblk_attr_group, 586 NULL, 587}; 588 589/* 590 * register_memory - Setup a sysfs device for a memory block 591 */ 592static 593int register_memory(struct memory_block *memory) 594{ 595 memory->dev.bus = &memory_subsys; 596 memory->dev.id = memory->start_section_nr / sections_per_block; 597 memory->dev.release = memory_block_release; 598 memory->dev.groups = memory_memblk_attr_groups; 599 memory->dev.offline = memory->state == MEM_OFFLINE; 600 601 return device_register(&memory->dev); 602} 603 604static int init_memory_block(struct memory_block **memory, 605 struct mem_section *section, unsigned long state) 606{ 607 struct memory_block *mem; 608 unsigned long start_pfn; 609 int scn_nr; 610 int ret = 0; 611 612 mem = kzalloc(sizeof(*mem), GFP_KERNEL); 613 if (!mem) 614 return -ENOMEM; 615 616 scn_nr = __section_nr(section); 617 mem->start_section_nr = 618 base_memory_block_id(scn_nr) * sections_per_block; 619 mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; 620 mem->state = state; 621 mem->section_count++; 622 start_pfn = section_nr_to_pfn(mem->start_section_nr); 623 mem->phys_device = arch_get_memory_phys_device(start_pfn); 624 625 ret = register_memory(mem); 626 627 *memory = mem; 628 return ret; 629} 630 631static int add_memory_block(int base_section_nr) 632{ 633 struct memory_block *mem; 634 int i, ret, section_count = 0, section_nr; 635 636 for (i = base_section_nr; 637 (i < base_section_nr + sections_per_block) && i < NR_MEM_SECTIONS; 638 i++) { 639 if (!present_section_nr(i)) 640 continue; 641 if (section_count == 0) 642 section_nr = i; 643 section_count++; 644 } 645 646 if (section_count == 0) 647 return 0; 648 ret = init_memory_block(&mem, __nr_to_section(section_nr), MEM_ONLINE); 649 if (ret) 650 return ret; 651 mem->section_count = section_count; 652 return 0; 653} 654 655 656/* 657 * need an interface for the VM to add new memory regions, 658 * but without onlining it. 659 */ 660int register_new_memory(int nid, struct mem_section *section) 661{ 662 int ret = 0; 663 struct memory_block *mem; 664 665 mutex_lock(&mem_sysfs_mutex); 666 667 mem = find_memory_block(section); 668 if (mem) { 669 mem->section_count++; 670 put_device(&mem->dev); 671 } else { 672 ret = init_memory_block(&mem, section, MEM_OFFLINE); 673 if (ret) 674 goto out; 675 } 676 677 if (mem->section_count == sections_per_block) 678 ret = register_mem_sect_under_node(mem, nid); 679out: 680 mutex_unlock(&mem_sysfs_mutex); 681 return ret; 682} 683 684#ifdef CONFIG_MEMORY_HOTREMOVE 685static void 686unregister_memory(struct memory_block *memory) 687{ 688 BUG_ON(memory->dev.bus != &memory_subsys); 689 690 /* drop the ref. we got in remove_memory_block() */ 691 put_device(&memory->dev); 692 device_unregister(&memory->dev); 693} 694 695static int remove_memory_block(unsigned long node_id, 696 struct mem_section *section, int phys_device) 697{ 698 struct memory_block *mem; 699 700 mutex_lock(&mem_sysfs_mutex); 701 mem = find_memory_block(section); 702 unregister_mem_sect_under_nodes(mem, __section_nr(section)); 703 704 mem->section_count--; 705 if (mem->section_count == 0) 706 unregister_memory(mem); 707 else 708 put_device(&mem->dev); 709 710 mutex_unlock(&mem_sysfs_mutex); 711 return 0; 712} 713 714int unregister_memory_section(struct mem_section *section) 715{ 716 if (!present_section(section)) 717 return -EINVAL; 718 719 return remove_memory_block(0, section, 0); 720} 721#endif /* CONFIG_MEMORY_HOTREMOVE */ 722 723/* return true if the memory block is offlined, otherwise, return false */ 724bool is_memblock_offlined(struct memory_block *mem) 725{ 726 return mem->state == MEM_OFFLINE; 727} 728 729static struct attribute *memory_root_attrs[] = { 730#ifdef CONFIG_ARCH_MEMORY_PROBE 731 &dev_attr_probe.attr, 732#endif 733 734#ifdef CONFIG_MEMORY_FAILURE 735 &dev_attr_soft_offline_page.attr, 736 &dev_attr_hard_offline_page.attr, 737#endif 738 739 &dev_attr_block_size_bytes.attr, 740 NULL 741}; 742 743static struct attribute_group memory_root_attr_group = { 744 .attrs = memory_root_attrs, 745}; 746 747static const struct attribute_group *memory_root_attr_groups[] = { 748 &memory_root_attr_group, 749 NULL, 750}; 751 752/* 753 * Initialize the sysfs support for memory devices... 754 */ 755int __init memory_dev_init(void) 756{ 757 unsigned int i; 758 int ret; 759 int err; 760 unsigned long block_sz; 761 762 ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); 763 if (ret) 764 goto out; 765 766 block_sz = get_memory_block_size(); 767 sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; 768 769 /* 770 * Create entries for memory sections that were found 771 * during boot and have been initialized 772 */ 773 mutex_lock(&mem_sysfs_mutex); 774 for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) { 775 err = add_memory_block(i); 776 if (!ret) 777 ret = err; 778 } 779 mutex_unlock(&mem_sysfs_mutex); 780 781out: 782 if (ret) 783 printk(KERN_ERR "%s() failed: %d\n", __func__, ret); 784 return ret; 785} 786