/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"
#include "p2m.h"
#include "mmu.h"

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/*
 * Buffer used to remap identity mapped pages. We only need the virtual space.
 * The physical page behind this address is remapped as needed to different
 * buffer pages.
 */
#define REMAP_SIZE	(P2M_PER_PAGE - 3)
static struct {
	unsigned long	next_area_mfn;
	unsigned long	target_pfn;
	unsigned long	size;
	unsigned long	mfns[REMAP_SIZE];
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
{
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(start, size);
}

static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
{
	int i;
	phys_addr_t start_r, size_r;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		start_r = xen_extra_mem[i].start;
		size_r = xen_extra_mem[i].size;

		/* Start of region. */
		if (start_r == start) {
			BUG_ON(size > size_r);
			xen_extra_mem[i].start += size;
			xen_extra_mem[i].size -= size;
			break;
		}
		/* End of region. */
		if (start_r + size_r == start + size) {
			BUG_ON(size > size_r);
			xen_extra_mem[i].size -= size;
			break;
		}
		/* Mid of region. */
		if (start > start_r && start < start_r + size_r) {
			BUG_ON(start + size > start_r + size_r);
			xen_extra_mem[i].size = start - start_r;
			/* Calling memblock_reserve() again is okay. */
			xen_add_extra_mem(start + size, start_r + size_r -
					  (start + size));
			break;
		}
	}
	memblock_free(start, size);
}
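
/*
 * Example for the "mid of region" case above: deleting [start, start + size)
 * from a larger extra-mem region [start_r, start_r + size_r) shrinks the
 * existing entry to [start_r, start) and re-adds the remaining tail
 * [start + size, start_r + size_r) via xen_add_extra_mem().
 */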

/*
 * Called during boot before the p2m list can take entries beyond the
 * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
 * invalid.
 */
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
	int i;
	phys_addr_t addr = PFN_PHYS(pfn);

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (addr >= xen_extra_mem[i].start &&
		    addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
			return INVALID_P2M_ENTRY;
	}

	return IDENTITY_FRAME(pfn);
}

/*
 * Mark all pfns of extra mem as invalid in p2m list.
 */
void __init xen_inv_extra_mem(void)
{
	unsigned long pfn, pfn_s, pfn_e;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (!xen_extra_mem[i].size)
			continue;
		pfn_s = PFN_DOWN(xen_extra_mem[i].start);
		pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
		for (pfn = pfn_s; pfn < pfn_e; pfn++)
			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}

/*
 * Finds the next RAM pfn available in the E820 map after min_pfn.
 * This function updates min_pfn with the pfn found and returns
 * the size of that range or zero if not found.
 */
static unsigned long __init xen_find_pfn_range(
	const struct e820entry *list, size_t map_size,
	unsigned long *min_pfn)
{
	const struct e820entry *entry;
	unsigned int i;
	unsigned long done = 0;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;

		if (entry->type != E820_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 after this */
		if (e_pfn < *min_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);

		/* If min_pfn falls within the E820 entry, we want to start
		 * at the min_pfn PFN.
		 */
		if (s_pfn <= *min_pfn) {
			done = e_pfn - *min_pfn;
		} else {
			done = e_pfn - s_pfn;
			*min_pfn = s_pfn;
		}
		break;
	}

	return done;
}

static int __init xen_free_mfn(unsigned long mfn)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};

	set_xen_guest_handle(reservation.extent_start, &mfn);
	reservation.nr_extents = 1;

	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
}
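
/*
 * XENMEM_decrease_reservation returns the number of extents actually
 * released, so xen_free_mfn() returning 1 means the single frame was
 * handed back to the hypervisor; callers treat any other value as failure.
 */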

/*
 * This releases a chunk of memory and then does the identity map. It's used
 * as a fallback if the remapping fails.
 */
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
	unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
{
	unsigned long pfn, end;
	int ret;

	WARN_ON(start_pfn > end_pfn);

	/* Release pages first. */
	end = min(end_pfn, nr_pages);
	for (pfn = start_pfn; pfn < end; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		/* Make sure pfn exists to start with */
		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
			continue;

		ret = xen_free_mfn(mfn);
		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);

		if (ret == 1) {
			(*released)++;
			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
				break;
		} else
			break;
	}

	set_phys_range_identity(start_pfn, end_pfn);
}

/*
 * Helper function to update the p2m and m2p tables and kernel mapping.
 */
static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
{
	struct mmu_update update = {
		.ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
		.val = pfn
	};

	/* Update p2m */
	if (!set_phys_to_machine(pfn, mfn)) {
		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
		     pfn, mfn);
		BUG();
	}

	/* Update m2p */
	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
		     mfn, pfn);
		BUG();
	}

	/* Update kernel mapping, but not for highmem. */
	if (pfn >= PFN_UP(__pa(high_memory - 1)))
		return;

	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
		     mfn, pfn);
		BUG();
	}
}

/*
 * This function updates the p2m and m2p tables with an identity map from
 * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
 * original allocation at remap_pfn. The information needed for remapping is
 * saved in the memory itself to avoid the need for allocating buffers. The
 * complete remap information is contained in a list of frames, each holding
 * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
 * This enables us to preserve the original mfn sequence while doing the
 * remapping at a time when the memory management is capable of allocating
 * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
 * its callers.
 */
static void __init xen_do_set_identity_and_remap_chunk(
	unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn;
	unsigned long ident_pfn_iter, remap_pfn_iter;
	unsigned long ident_end_pfn = start_pfn + size;
	unsigned long left = size;
	unsigned int i, chunk;

	WARN_ON(size == 0);

	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));

	mfn_save = virt_to_mfn(buf);

	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
	     ident_pfn_iter < ident_end_pfn;
	     ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
		chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;

		/* Map first pfn to xen_remap_buf */
		mfn = pfn_to_mfn(ident_pfn_iter);
		set_pte_mfn(buf, mfn, PAGE_KERNEL);

		/* Save mapping information in page */
		xen_remap_buf.next_area_mfn = xen_remap_mfn;
		xen_remap_buf.target_pfn = remap_pfn_iter;
		xen_remap_buf.size = chunk;
		for (i = 0; i < chunk; i++)
			xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);

		/* Put remap buf into list. */
		xen_remap_mfn = mfn;

		/* Set identity map */
		set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);

		left -= chunk;
	}

	/* Restore old xen_remap_buf mapping */
	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
}
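
/*
 * Layout of the remap list built above: xen_remap_mfn points at the most
 * recently saved chunk.  The first frame of each chunk temporarily serves
 * as a header holding next_area_mfn (the previous list head), the
 * target_pfn to remap to, the chunk size and the saved mfns[] themselves.
 * xen_remap_memory() walks this list once the memory management is up.
 */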

/*
 * This function takes a contiguous pfn range that needs to be identity mapped
 * and:
 *
 *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
 *  2) Calls the do_ function to actually do the mapping/remapping work.
 *
 * The goal is to not allocate additional memory but to remap the existing
 * pages. In the case of an error the underlying memory is simply released back
 * to Xen and not remapped.
 */
static unsigned long __init xen_set_identity_and_remap_chunk(
	const struct e820entry *list, size_t map_size, unsigned long start_pfn,
	unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
	unsigned long *released, unsigned long *remapped)
{
	unsigned long pfn;
	unsigned long i = 0;
	unsigned long n = end_pfn - start_pfn;

	while (i < n) {
		unsigned long cur_pfn = start_pfn + i;
		unsigned long left = n - i;
		unsigned long size = left;
		unsigned long remap_range_size;

		/* Do not remap pages beyond the current allocation */
		if (cur_pfn >= nr_pages) {
			/* Identity map remaining pages */
			set_phys_range_identity(cur_pfn, cur_pfn + size);
			break;
		}
		if (cur_pfn + size > nr_pages)
			size = nr_pages - cur_pfn;

		remap_range_size = xen_find_pfn_range(list, map_size,
						      &remap_pfn);
		if (!remap_range_size) {
			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
			xen_set_identity_and_release_chunk(cur_pfn,
				cur_pfn + left, nr_pages, released);
			break;
		}
		/* Adjust size to fit in current e820 RAM region */
		if (size > remap_range_size)
			size = remap_range_size;

		xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);

		/* Update variables to reflect new mappings. */
		i += size;
		remap_pfn += size;
		*remapped += size;
	}

	/*
	 * If the PFNs are currently mapped, the VA mapping also needs
	 * to be updated to be 1:1.
	 */
	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT),
			mfn_pte(pfn, PAGE_KERNEL_IO), 0);

	return remap_pfn;
}

static void __init xen_set_identity_and_remap(
	const struct e820entry *list, size_t map_size, unsigned long nr_pages,
	unsigned long *released, unsigned long *remapped)
{
	phys_addr_t start = 0;
	unsigned long last_pfn = nr_pages;
	const struct e820entry *entry;
	unsigned long num_released = 0;
	unsigned long num_remapped = 0;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * remap the memory in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_RAM || i == map_size - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				last_pfn = xen_set_identity_and_remap_chunk(
						list, map_size, start_pfn,
						end_pfn, nr_pages, last_pfn,
						&num_released, &num_remapped);
			start = end;
		}
	}

	*released = num_released;
	*remapped = num_remapped;

	pr_info("Released %ld page(s)\n", num_released);
}
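
/*
 * Note that the remap target (last_pfn, initially nr_pages) only moves
 * forward: xen_find_pfn_range() hands out E820 RAM ranges at or above it,
 * so the preserved frames are placed beyond the initial allocation.  The
 * corresponding pfn ranges are removed from the extra memory regions by
 * xen_remap_memory() once the frames have actually been remapped.
 */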

/*
 * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
 * The remap information (which mfn is remapped to which pfn) is contained
 * in the to be remapped memory itself in a linked list anchored at
 * xen_remap_mfn. This scheme allows the chunks to be remapped in arbitrary
 * order while the resulting mapping is independent of the order.
 */
void __init xen_remap_memory(void)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn, pfn;
	unsigned long remapped = 0;
	unsigned int i;
	unsigned long pfn_s = ~0UL;
	unsigned long len = 0;

	mfn_save = virt_to_mfn(buf);

	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
		/* Map the remap information */
		set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);

		BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);

		pfn = xen_remap_buf.target_pfn;
		for (i = 0; i < xen_remap_buf.size; i++) {
			mfn = xen_remap_buf.mfns[i];
			xen_update_mem_tables(pfn, mfn);
			remapped++;
			pfn++;
		}
		if (pfn_s == ~0UL || pfn == pfn_s) {
			pfn_s = xen_remap_buf.target_pfn;
			len += xen_remap_buf.size;
		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
			len += xen_remap_buf.size;
		} else {
			xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
			pfn_s = xen_remap_buf.target_pfn;
			len = xen_remap_buf.size;
		}

		mfn = xen_remap_mfn;
		xen_remap_mfn = xen_remap_buf.next_area_mfn;
	}

	if (pfn_s != ~0UL && len)
		xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));

	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);

	pr_info("Remapped %ld page(s)\n", remapped);
}

static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum. In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, MAX_DOMAIN_PAGES);
}
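
/*
 * RAM regions are shrunk inwards to whole pages below (start rounded up,
 * end rounded down), while other region types keep their byte-granular
 * boundaries; partial pages at the edges of RAM are therefore never
 * reported as usable RAM.
 */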

static void __init xen_align_and_add_e820_region(phys_addr_t start,
						 phys_addr_t size, int type)
{
	phys_addr_t end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((phys_addr_t)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}

static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
	struct e820entry *entry;
	unsigned int i;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		if (entry->type == E820_UNUSABLE)
			entry->type = E820_RAM;
	}
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	phys_addr_t mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long extra_pages = 0;
	unsigned long remapped_pages;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);
	BUG_ON(memmap.nr_entries == 0);

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable(map, memmap.nr_entries);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set identity map on non-RAM pages and prepare remapping the
	 * underlying RAM.
	 */
	xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
				   &xen_released_pages, &remapped_pages);

	extra_pages += xen_released_pages;
	extra_pages += remapped_pages;

	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
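	/*
	 * For example, with a 1 GiB (262144 page) initial allocation on a
	 * non-highmem system, the clamp below limits extra_pages to at most
	 * 10 * 262144 pages, i.e. roughly 10 GiB of extra memory space.
	 */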
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);
	i = 0;
	while (i < memmap.nr_entries) {
		phys_addr_t addr = map[i].addr;
		phys_addr_t size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				size = min(size, PFN_PHYS(extra_pages));
				extra_pages -= PFN_DOWN(size);
				xen_add_extra_mem(addr, size);
				xen_max_p2m_pfn = PFN_DOWN(addr + size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 *
	 * PFNs above MAX_P2M_PFN are considered identity mapped as
	 * well.
	 */
	set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 * We tried to make the memblock_reserve more selective so
	 * that it would be clear what region is reserved. Sadly we ran
	 * into the problem wherein on a 64-bit hypervisor with a 32-bit
	 * initial domain, the pt_base has the cr3 value which is not
	 * necessarily where the pagetable starts! As Jan put it: "
	 * Actually, the adjustment turns out to be correct: The page
	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
	 * "first L2", "first L3", so the offset to the page table base is
	 * indeed 2. When reading xen/include/public/xen.h's comment
	 * very strictly, this is not a violation (since there nothing is said
	 * that the first thing in the page table space is pointed to by
	 * pt_base; I admit that this seems to be implied though, namely
	 * do I think that it is implied that the page table space is the
	 * range [pt_base, pt_base + nt_pt_frames), whereas that
	 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
	 * which - without a priori knowledge - the kernel would have
	 * difficulty to figure out)." - so let's just fall back to the
	 * easy way and reserve the whole region.
	 */
	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}
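
/*
 * The string returned by xen_memory_setup() is used by the generic e820
 * code as the name of the provider when the memory map is printed at boot.
 */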

/*
 * Machine specific memory setup for auto-translated guests.
 */
char * __init xen_auto_xlated_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	struct xen_memory_map memmap;
	int i;
	int rc;

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc < 0)
		panic("No memory map (%d)\n", rc);

	sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);

	for (i = 0; i < memmap.nr_entries; i++)
		e820_add_region(map[i].addr, map[i].size, map[i].type);

	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	/*
	 * This could be called before selected_vdso32 is initialized, so
	 * just fiddle with both possible images.  vdso_image_32_syscall
	 * can't be selected, since it only exists on 64-bit systems.
	 */
	u32 *mask;
	mask = vdso_image_32_int80.data +
		vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = vdso_image_32_sysenter.data +
		vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

static int register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
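
/*
 * Register the sysenter callback with Xen.  On failure the corresponding
 * CPU feature bit is cleared below, so userspace is not offered the
 * sysenter entry path and falls back to another system call mechanism.
 */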

void xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
}

/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
	fiddle_vdso();
#ifdef CONFIG_NUMA
	numa_off = 1;
#endif
}