1/* 2 * Fast Userspace Mutexes (which I call "Futexes!"). 3 * (C) Rusty Russell, IBM 2002 4 * 5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar 6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved 7 * 8 * Removed page pinning, fix privately mapped COW pages and other cleanups 9 * (C) Copyright 2003, 2004 Jamie Lokier 10 * 11 * Robust futex support started by Ingo Molnar 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 14 * 15 * PI-futex support started by Ingo Molnar and Thomas Gleixner 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 18 * 19 * PRIVATE futexes by Eric Dumazet 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 21 * 22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> 23 * Copyright (C) IBM Corporation, 2009 24 * Thanks to Thomas Gleixner for conceptual design and careful reviews. 25 * 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 27 * enough at me, Linus for the original (flawed) idea, Matthew 28 * Kirkwood for proof-of-concept implementation. 29 * 30 * "The futexes are also cursed." 31 * "But they come in a choice of three flavours!" 32 * 33 * This program is free software; you can redistribute it and/or modify 34 * it under the terms of the GNU General Public License as published by 35 * the Free Software Foundation; either version 2 of the License, or 36 * (at your option) any later version. 37 * 38 * This program is distributed in the hope that it will be useful, 39 * but WITHOUT ANY WARRANTY; without even the implied warranty of 40 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 41 * GNU General Public License for more details. 42 * 43 * You should have received a copy of the GNU General Public License 44 * along with this program; if not, write to the Free Software 45 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 46 */ 47#include <linux/slab.h> 48#include <linux/poll.h> 49#include <linux/fs.h> 50#include <linux/file.h> 51#include <linux/jhash.h> 52#include <linux/init.h> 53#include <linux/futex.h> 54#include <linux/mount.h> 55#include <linux/pagemap.h> 56#include <linux/syscalls.h> 57#include <linux/signal.h> 58#include <linux/export.h> 59#include <linux/magic.h> 60#include <linux/pid.h> 61#include <linux/nsproxy.h> 62#include <linux/ptrace.h> 63#include <linux/sched/rt.h> 64#include <linux/hugetlb.h> 65#include <linux/freezer.h> 66#include <linux/bootmem.h> 67 68#include <asm/futex.h> 69 70#include "locking/rtmutex_common.h" 71 72/* 73 * READ this before attempting to hack on futexes! 74 * 75 * Basic futex operation and ordering guarantees 76 * ============================================= 77 * 78 * The waiter reads the futex value in user space and calls 79 * futex_wait(). This function computes the hash bucket and acquires 80 * the hash bucket lock. After that it reads the futex user space value 81 * again and verifies that the data has not changed. If it has not changed 82 * it enqueues itself into the hash bucket, releases the hash bucket lock 83 * and schedules. 84 * 85 * The waker side modifies the user space value of the futex and calls 86 * futex_wake(). This function computes the hash bucket and acquires the 87 * hash bucket lock. Then it looks for waiters on that futex in the hash 88 * bucket and wakes them. 
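 *
 * As an illustration only (not kernel code), the user space side of this
 * protocol is typically a loop like the following, where futex() stands for
 * a raw syscall(SYS_futex, ...) wrapper and DESIRED is whatever value means
 * "signalled":
 *
 *	Waiter:
 *		for (;;) {
 *			val = *uaddr;
 *			if (val == DESIRED)
 *				break;
 *			futex(uaddr, FUTEX_WAIT, val, NULL);
 *		}
 *
 *	Waker:
 *		*uaddr = DESIRED;
 *		futex(uaddr, FUTEX_WAKE, 1, NULL);
 *
 * FUTEX_WAIT only blocks if the futex word still contains the value the
 * waiter passed in; otherwise it returns immediately (-EWOULDBLOCK), which
 * is what makes the check-then-sleep sequence race free.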
89 * 90 * In futex wake up scenarios where no tasks are blocked on a futex, taking 91 * the hb spinlock can be avoided and simply return. In order for this 92 * optimization to work, ordering guarantees must exist so that the waiter 93 * being added to the list is acknowledged when the list is concurrently being 94 * checked by the waker, avoiding scenarios like the following: 95 * 96 * CPU 0 CPU 1 97 * val = *futex; 98 * sys_futex(WAIT, futex, val); 99 * futex_wait(futex, val); 100 * uval = *futex; 101 * *futex = newval; 102 * sys_futex(WAKE, futex); 103 * futex_wake(futex); 104 * if (queue_empty()) 105 * return; 106 * if (uval == val) 107 * lock(hash_bucket(futex)); 108 * queue(); 109 * unlock(hash_bucket(futex)); 110 * schedule(); 111 * 112 * This would cause the waiter on CPU 0 to wait forever because it 113 * missed the transition of the user space value from val to newval 114 * and the waker did not find the waiter in the hash bucket queue. 115 * 116 * The correct serialization ensures that a waiter either observes 117 * the changed user space value before blocking or is woken by a 118 * concurrent waker: 119 * 120 * CPU 0 CPU 1 121 * val = *futex; 122 * sys_futex(WAIT, futex, val); 123 * futex_wait(futex, val); 124 * 125 * waiters++; (a) 126 * mb(); (A) <-- paired with -. 127 * | 128 * lock(hash_bucket(futex)); | 129 * | 130 * uval = *futex; | 131 * | *futex = newval; 132 * | sys_futex(WAKE, futex); 133 * | futex_wake(futex); 134 * | 135 * `-------> mb(); (B) 136 * if (uval == val) 137 * queue(); 138 * unlock(hash_bucket(futex)); 139 * schedule(); if (waiters) 140 * lock(hash_bucket(futex)); 141 * else wake_waiters(futex); 142 * waiters--; (b) unlock(hash_bucket(futex)); 143 * 144 * Where (A) orders the waiters increment and the futex value read through 145 * atomic operations (see hb_waiters_inc) and where (B) orders the write 146 * to futex and the waiters read -- this is done by the barriers for both 147 * shared and private futexes in get_futex_key_refs(). 148 * 149 * This yields the following case (where X:=waiters, Y:=futex): 150 * 151 * X = Y = 0 152 * 153 * w[X]=1 w[Y]=1 154 * MB MB 155 * r[Y]=y r[X]=x 156 * 157 * Which guarantees that x==0 && y==0 is impossible; which translates back into 158 * the guarantee that we cannot both miss the futex variable change and the 159 * enqueue. 160 * 161 * Note that a new waiter is accounted for in (a) even when it is possible that 162 * the wait call can return error, in which case we backtrack from it in (b). 163 * Refer to the comment in queue_lock(). 164 * 165 * Similarly, in order to account for waiters being requeued on another 166 * address we always increment the waiters for the destination bucket before 167 * acquiring the lock. It then decrements them again after releasing it - 168 * the code that actually moves the futex(es) between hash buckets (requeue_futex) 169 * will do the additional required waiter count housekeeping. This is done for 170 * double_lock_hb() and double_unlock_hb(), respectively. 171 */ 172 173#ifndef CONFIG_HAVE_FUTEX_CMPXCHG 174int __read_mostly futex_cmpxchg_enabled; 175#endif 176 177/* 178 * Futex flags used to encode options to functions and preserve them across 179 * restarts. 
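 *
 * For example, the syscall path (see do_futex()) derives them from the
 * futex op word roughly like this (illustrative sketch, not a copy):
 *
 *	if (!(op & FUTEX_PRIVATE_FLAG))
 *		flags |= FLAGS_SHARED;
 *	if (op & FUTEX_CLOCK_REALTIME)
 *		flags |= FLAGS_CLOCKRT;
 *
 * FLAGS_HAS_TIMEOUT is set by the wait side when a timeout was supplied,
 * so that a restarted wait knows whether restart_block carries a valid
 * absolute expiry time.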
180 */ 181#define FLAGS_SHARED 0x01 182#define FLAGS_CLOCKRT 0x02 183#define FLAGS_HAS_TIMEOUT 0x04 184 185/* 186 * Priority Inheritance state: 187 */ 188struct futex_pi_state { 189 /* 190 * list of 'owned' pi_state instances - these have to be 191 * cleaned up in do_exit() if the task exits prematurely: 192 */ 193 struct list_head list; 194 195 /* 196 * The PI object: 197 */ 198 struct rt_mutex pi_mutex; 199 200 struct task_struct *owner; 201 atomic_t refcount; 202 203 union futex_key key; 204}; 205 206/** 207 * struct futex_q - The hashed futex queue entry, one per waiting task 208 * @list: priority-sorted list of tasks waiting on this futex 209 * @task: the task waiting on the futex 210 * @lock_ptr: the hash bucket lock 211 * @key: the key the futex is hashed on 212 * @pi_state: optional priority inheritance state 213 * @rt_waiter: rt_waiter storage for use with requeue_pi 214 * @requeue_pi_key: the requeue_pi target futex key 215 * @bitset: bitset for the optional bitmasked wakeup 216 * 217 * We use this hashed waitqueue, instead of a normal wait_queue_t, so 218 * we can wake only the relevant ones (hashed queues may be shared). 219 * 220 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 221 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 222 * The order of wakeup is always to make the first condition true, then 223 * the second. 224 * 225 * PI futexes are typically woken before they are removed from the hash list via 226 * the rt_mutex code. See unqueue_me_pi(). 227 */ 228struct futex_q { 229 struct plist_node list; 230 231 struct task_struct *task; 232 spinlock_t *lock_ptr; 233 union futex_key key; 234 struct futex_pi_state *pi_state; 235 struct rt_mutex_waiter *rt_waiter; 236 union futex_key *requeue_pi_key; 237 u32 bitset; 238}; 239 240static const struct futex_q futex_q_init = { 241 /* list gets initialized in queue_me()*/ 242 .key = FUTEX_KEY_INIT, 243 .bitset = FUTEX_BITSET_MATCH_ANY 244}; 245 246/* 247 * Hash buckets are shared by all the futex_keys that hash to the same 248 * location. Each key may have multiple futex_q structures, one for each task 249 * waiting on a futex. 250 */ 251struct futex_hash_bucket { 252 atomic_t waiters; 253 spinlock_t lock; 254 struct plist_head chain; 255} ____cacheline_aligned_in_smp; 256 257static unsigned long __read_mostly futex_hashsize; 258 259static struct futex_hash_bucket *futex_queues; 260 261static inline void futex_get_mm(union futex_key *key) 262{ 263 atomic_inc(&key->private.mm->mm_count); 264 /* 265 * Ensure futex_get_mm() implies a full barrier such that 266 * get_futex_key() implies a full barrier. This is relied upon 267 * as full barrier (B), see the ordering comment above. 268 */ 269 smp_mb__after_atomic(); 270} 271 272/* 273 * Reflects a new waiter being added to the waitqueue. 274 */ 275static inline void hb_waiters_inc(struct futex_hash_bucket *hb) 276{ 277#ifdef CONFIG_SMP 278 atomic_inc(&hb->waiters); 279 /* 280 * Full barrier (A), see the ordering comment above. 281 */ 282 smp_mb__after_atomic(); 283#endif 284} 285 286/* 287 * Reflects a waiter being removed from the waitqueue by wakeup 288 * paths. 
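 *
 * For reference, the increment/decrement pairs in this file are roughly:
 *
 *	queue_lock()	  -> hb_waiters_inc()	(before taking hb->lock)
 *	queue_unlock()	  -> hb_waiters_dec()	(error / early-out paths)
 *	__unqueue_futex() -> hb_waiters_dec()	(wakeup and unqueue paths)
 *	requeue_futex()	  -> hb_waiters_dec(hb1) + hb_waiters_inc(hb2)
 *
 * which is what allows futex_wake() to use hb_waiters_pending() as a
 * lockless "are there any waiters at all?" check.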
289 */ 290static inline void hb_waiters_dec(struct futex_hash_bucket *hb) 291{ 292#ifdef CONFIG_SMP 293 atomic_dec(&hb->waiters); 294#endif 295} 296 297static inline int hb_waiters_pending(struct futex_hash_bucket *hb) 298{ 299#ifdef CONFIG_SMP 300 return atomic_read(&hb->waiters); 301#else 302 return 1; 303#endif 304} 305 306/* 307 * We hash on the keys returned from get_futex_key (see below). 308 */ 309static struct futex_hash_bucket *hash_futex(union futex_key *key) 310{ 311 u32 hash = jhash2((u32*)&key->both.word, 312 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 313 key->both.offset); 314 return &futex_queues[hash & (futex_hashsize - 1)]; 315} 316 317/* 318 * Return 1 if two futex_keys are equal, 0 otherwise. 319 */ 320static inline int match_futex(union futex_key *key1, union futex_key *key2) 321{ 322 return (key1 && key2 323 && key1->both.word == key2->both.word 324 && key1->both.ptr == key2->both.ptr 325 && key1->both.offset == key2->both.offset); 326} 327 328/* 329 * Take a reference to the resource addressed by a key. 330 * Can be called while holding spinlocks. 331 * 332 */ 333static void get_futex_key_refs(union futex_key *key) 334{ 335 if (!key->both.ptr) 336 return; 337 338 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 339 case FUT_OFF_INODE: 340 ihold(key->shared.inode); /* implies MB (B) */ 341 break; 342 case FUT_OFF_MMSHARED: 343 futex_get_mm(key); /* implies MB (B) */ 344 break; 345 default: 346 /* 347 * Private futexes do not hold reference on an inode or 348 * mm, therefore the only purpose of calling get_futex_key_refs 349 * is because we need the barrier for the lockless waiter check. 350 */ 351 smp_mb(); /* explicit MB (B) */ 352 } 353} 354 355/* 356 * Drop a reference to the resource addressed by a key. 357 * The hash bucket spinlock must not be held. This is 358 * a no-op for private futexes, see comment in the get 359 * counterpart. 360 */ 361static void drop_futex_key_refs(union futex_key *key) 362{ 363 if (!key->both.ptr) { 364 /* If we're here then we tried to put a key we failed to get */ 365 WARN_ON_ONCE(1); 366 return; 367 } 368 369 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 370 case FUT_OFF_INODE: 371 iput(key->shared.inode); 372 break; 373 case FUT_OFF_MMSHARED: 374 mmdrop(key->private.mm); 375 break; 376 } 377} 378 379/** 380 * get_futex_key() - Get parameters which are the keys for a futex 381 * @uaddr: virtual address of the futex 382 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 383 * @key: address where result is stored. 384 * @rw: mapping needs to be read/write (values: VERIFY_READ, 385 * VERIFY_WRITE) 386 * 387 * Return: a negative error code or 0 388 * 389 * The key words are stored in *key on success. 390 * 391 * For shared mappings, it's (page->index, file_inode(vma->vm_file), 392 * offset_within_page). For private mappings, it's (uaddr, current->mm). 393 * We can usually work out the index without swapping in the page. 394 * 395 * lock_page() might sleep, the caller should not hold a spinlock. 396 */ 397static int 398get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 399{ 400 unsigned long address = (unsigned long)uaddr; 401 struct mm_struct *mm = current->mm; 402 struct page *page, *page_head; 403 int err, ro = 0; 404 405 /* 406 * The futex address must be "naturally" aligned. 
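 *
 * Because the address is u32 aligned, the two low bits of the page offset
 * are always zero and are reused as key type bits (FUT_OFF_INODE and
 * FUT_OFF_MMSHARED).  A worked example (illustrative): for
 * uaddr == 0x7f0000201004,
 *
 *	key->both.offset = 0x004;	(a type bit may be OR'ed in further down)
 *	address		 = 0x7f0000201000;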
407 */ 408 key->both.offset = address % PAGE_SIZE; 409 if (unlikely((address % sizeof(u32)) != 0)) 410 return -EINVAL; 411 address -= key->both.offset; 412 413 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 414 return -EFAULT; 415 416 /* 417 * PROCESS_PRIVATE futexes are fast. 418 * As the mm cannot disappear under us and the 'key' only needs 419 * virtual address, we dont even have to find the underlying vma. 420 * Note : We do have to check 'uaddr' is a valid user address, 421 * but access_ok() should be faster than find_vma() 422 */ 423 if (!fshared) { 424 key->private.mm = mm; 425 key->private.address = address; 426 get_futex_key_refs(key); /* implies MB (B) */ 427 return 0; 428 } 429 430again: 431 err = get_user_pages_fast(address, 1, 1, &page); 432 /* 433 * If write access is not required (eg. FUTEX_WAIT), try 434 * and get read-only access. 435 */ 436 if (err == -EFAULT && rw == VERIFY_READ) { 437 err = get_user_pages_fast(address, 1, 0, &page); 438 ro = 1; 439 } 440 if (err < 0) 441 return err; 442 else 443 err = 0; 444 445#ifdef CONFIG_TRANSPARENT_HUGEPAGE 446 page_head = page; 447 if (unlikely(PageTail(page))) { 448 put_page(page); 449 /* serialize against __split_huge_page_splitting() */ 450 local_irq_disable(); 451 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { 452 page_head = compound_head(page); 453 /* 454 * page_head is valid pointer but we must pin 455 * it before taking the PG_lock and/or 456 * PG_compound_lock. The moment we re-enable 457 * irqs __split_huge_page_splitting() can 458 * return and the head page can be freed from 459 * under us. We can't take the PG_lock and/or 460 * PG_compound_lock on a page that could be 461 * freed from under us. 462 */ 463 if (page != page_head) { 464 get_page(page_head); 465 put_page(page); 466 } 467 local_irq_enable(); 468 } else { 469 local_irq_enable(); 470 goto again; 471 } 472 } 473#else 474 page_head = compound_head(page); 475 if (page != page_head) { 476 get_page(page_head); 477 put_page(page); 478 } 479#endif 480 481 lock_page(page_head); 482 483 /* 484 * If page_head->mapping is NULL, then it cannot be a PageAnon 485 * page; but it might be the ZERO_PAGE or in the gate area or 486 * in a special mapping (all cases which we are happy to fail); 487 * or it may have been a good file page when get_user_pages_fast 488 * found it, but truncated or holepunched or subjected to 489 * invalidate_complete_page2 before we got the page lock (also 490 * cases which we are happy to fail). And we hold a reference, 491 * so refcount care in invalidate_complete_page's remove_mapping 492 * prevents drop_caches from setting mapping to NULL beneath us. 493 * 494 * The case we do have to guard against is when memory pressure made 495 * shmem_writepage move it from filecache to swapcache beneath us: 496 * an unlikely race, but we do need to retry for page_head->mapping. 497 */ 498 if (!page_head->mapping) { 499 int shmem_swizzled = PageSwapCache(page_head); 500 unlock_page(page_head); 501 put_page(page_head); 502 if (shmem_swizzled) 503 goto again; 504 return -EFAULT; 505 } 506 507 /* 508 * Private mappings are handled in a simple way. 509 * 510 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 511 * it's a read-only handle, it's expected that futexes attach to 512 * the object not the particular process. 513 */ 514 if (PageAnon(page_head)) { 515 /* 516 * A RO anonymous page will never change and thus doesn't make 517 * sense for futex operations. 
518 */ 519 if (ro) { 520 err = -EFAULT; 521 goto out; 522 } 523 524 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 525 key->private.mm = mm; 526 key->private.address = address; 527 } else { 528 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 529 key->shared.inode = page_head->mapping->host; 530 key->shared.pgoff = basepage_index(page); 531 } 532 533 get_futex_key_refs(key); /* implies MB (B) */ 534 535out: 536 unlock_page(page_head); 537 put_page(page_head); 538 return err; 539} 540 541static inline void put_futex_key(union futex_key *key) 542{ 543 drop_futex_key_refs(key); 544} 545 546/** 547 * fault_in_user_writeable() - Fault in user address and verify RW access 548 * @uaddr: pointer to faulting user space address 549 * 550 * Slow path to fixup the fault we just took in the atomic write 551 * access to @uaddr. 552 * 553 * We have no generic implementation of a non-destructive write to the 554 * user address. We know that we faulted in the atomic pagefault 555 * disabled section so we can as well avoid the #PF overhead by 556 * calling get_user_pages() right away. 557 */ 558static int fault_in_user_writeable(u32 __user *uaddr) 559{ 560 struct mm_struct *mm = current->mm; 561 int ret; 562 563 down_read(&mm->mmap_sem); 564 ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 565 FAULT_FLAG_WRITE); 566 up_read(&mm->mmap_sem); 567 568 return ret < 0 ? ret : 0; 569} 570 571/** 572 * futex_top_waiter() - Return the highest priority waiter on a futex 573 * @hb: the hash bucket the futex_q's reside in 574 * @key: the futex key (to distinguish it from other futex futex_q's) 575 * 576 * Must be called with the hb lock held. 577 */ 578static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, 579 union futex_key *key) 580{ 581 struct futex_q *this; 582 583 plist_for_each_entry(this, &hb->chain, list) { 584 if (match_futex(&this->key, key)) 585 return this; 586 } 587 return NULL; 588} 589 590static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, 591 u32 uval, u32 newval) 592{ 593 int ret; 594 595 pagefault_disable(); 596 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); 597 pagefault_enable(); 598 599 return ret; 600} 601 602static int get_futex_value_locked(u32 *dest, u32 __user *from) 603{ 604 int ret; 605 606 pagefault_disable(); 607 ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); 608 pagefault_enable(); 609 610 return ret ? -EFAULT : 0; 611} 612 613 614/* 615 * PI code: 616 */ 617static int refill_pi_state_cache(void) 618{ 619 struct futex_pi_state *pi_state; 620 621 if (likely(current->pi_state_cache)) 622 return 0; 623 624 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); 625 626 if (!pi_state) 627 return -ENOMEM; 628 629 INIT_LIST_HEAD(&pi_state->list); 630 /* pi_mutex gets initialized later */ 631 pi_state->owner = NULL; 632 atomic_set(&pi_state->refcount, 1); 633 pi_state->key = FUTEX_KEY_INIT; 634 635 current->pi_state_cache = pi_state; 636 637 return 0; 638} 639 640static struct futex_pi_state * alloc_pi_state(void) 641{ 642 struct futex_pi_state *pi_state = current->pi_state_cache; 643 644 WARN_ON(!pi_state); 645 current->pi_state_cache = NULL; 646 647 return pi_state; 648} 649 650/* 651 * Must be called with the hb lock held. 
 */
static void free_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!atomic_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		raw_spin_lock_irq(&pi_state->owner->pi_lock);
		list_del_init(&pi_state->list);
		raw_spin_unlock_irq(&pi_state->owner->pi_lock);

		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
	}

	if (current->pi_state_cache)
		kfree(pi_state);
	else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		atomic_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * Look up the task based on what TID userspace gave us.
 * We don't trust it.
 */
static struct task_struct * futex_find_get_task(pid_t pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p)
		get_task_struct(p);

	rcu_read_unlock();

	return p;
}

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	if (!futex_cmpxchg_enabled)
		return;
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {

		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = hash_futex(&key);
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);

		raw_spin_lock_irq(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			spin_unlock(&hb->lock);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;
		raw_spin_unlock_irq(&curr->pi_lock);

		rt_mutex_unlock(&pi_state->pi_mutex);

		spin_unlock(&hb->lock);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
783 * 784 * [2] Valid, if TID does not belong to a kernel thread. If no matching 785 * thread is found then it indicates that the owner TID has died. 786 * 787 * [3] Invalid. The waiter is queued on a non PI futex 788 * 789 * [4] Valid state after exit_robust_list(), which sets the user space 790 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. 791 * 792 * [5] The user space value got manipulated between exit_robust_list() 793 * and exit_pi_state_list() 794 * 795 * [6] Valid state after exit_pi_state_list() which sets the new owner in 796 * the pi_state but cannot access the user space value. 797 * 798 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. 799 * 800 * [8] Owner and user space value match 801 * 802 * [9] There is no transient state which sets the user space TID to 0 803 * except exit_robust_list(), but this is indicated by the 804 * FUTEX_OWNER_DIED bit. See [4] 805 * 806 * [10] There is no transient state which leaves owner and user space 807 * TID out of sync. 808 */ 809 810/* 811 * Validate that the existing waiter has a pi_state and sanity check 812 * the pi_state against the user space value. If correct, attach to 813 * it. 814 */ 815static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, 816 struct futex_pi_state **ps) 817{ 818 pid_t pid = uval & FUTEX_TID_MASK; 819 820 /* 821 * Userspace might have messed up non-PI and PI futexes [3] 822 */ 823 if (unlikely(!pi_state)) 824 return -EINVAL; 825 826 WARN_ON(!atomic_read(&pi_state->refcount)); 827 828 /* 829 * Handle the owner died case: 830 */ 831 if (uval & FUTEX_OWNER_DIED) { 832 /* 833 * exit_pi_state_list sets owner to NULL and wakes the 834 * topmost waiter. The task which acquires the 835 * pi_state->rt_mutex will fixup owner. 836 */ 837 if (!pi_state->owner) { 838 /* 839 * No pi state owner, but the user space TID 840 * is not 0. Inconsistent state. [5] 841 */ 842 if (pid) 843 return -EINVAL; 844 /* 845 * Take a ref on the state and return success. [4] 846 */ 847 goto out_state; 848 } 849 850 /* 851 * If TID is 0, then either the dying owner has not 852 * yet executed exit_pi_state_list() or some waiter 853 * acquired the rtmutex in the pi state, but did not 854 * yet fixup the TID in user space. 855 * 856 * Take a ref on the state and return success. [6] 857 */ 858 if (!pid) 859 goto out_state; 860 } else { 861 /* 862 * If the owner died bit is not set, then the pi_state 863 * must have an owner. [7] 864 */ 865 if (!pi_state->owner) 866 return -EINVAL; 867 } 868 869 /* 870 * Bail out if user space manipulated the futex value. If pi 871 * state exists then the owner TID must be the same as the 872 * user space TID. [9/10] 873 */ 874 if (pid != task_pid_vnr(pi_state->owner)) 875 return -EINVAL; 876out_state: 877 atomic_inc(&pi_state->refcount); 878 *ps = pi_state; 879 return 0; 880} 881 882/* 883 * Lookup the task for the TID provided from user space and attach to 884 * it after doing proper sanity checks. 
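 *
 * For reference, the user space value of a PI futex decomposes as
 * (constants from <linux/futex.h>):
 *
 *	uval & FUTEX_TID_MASK	 - TID of the current owner, 0 == unowned
 *	uval & FUTEX_WAITERS	 - the kernel may have waiters queued
 *	uval & FUTEX_OWNER_DIED	 - a previous owner died holding the futex
 *
 * E.g. uval == (FUTEX_WAITERS | FUTEX_OWNER_DIED) with a zero TID is
 * exactly what exit_robust_list() leaves behind, i.e. case [4] above.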
885 */ 886static int attach_to_pi_owner(u32 uval, union futex_key *key, 887 struct futex_pi_state **ps) 888{ 889 pid_t pid = uval & FUTEX_TID_MASK; 890 struct futex_pi_state *pi_state; 891 struct task_struct *p; 892 893 /* 894 * We are the first waiter - try to look up the real owner and attach 895 * the new pi_state to it, but bail out when TID = 0 [1] 896 */ 897 if (!pid) 898 return -ESRCH; 899 p = futex_find_get_task(pid); 900 if (!p) 901 return -ESRCH; 902 903 if (unlikely(p->flags & PF_KTHREAD)) { 904 put_task_struct(p); 905 return -EPERM; 906 } 907 908 /* 909 * We need to look at the task state flags to figure out, 910 * whether the task is exiting. To protect against the do_exit 911 * change of the task flags, we do this protected by 912 * p->pi_lock: 913 */ 914 raw_spin_lock_irq(&p->pi_lock); 915 if (unlikely(p->flags & PF_EXITING)) { 916 /* 917 * The task is on the way out. When PF_EXITPIDONE is 918 * set, we know that the task has finished the 919 * cleanup: 920 */ 921 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 922 923 raw_spin_unlock_irq(&p->pi_lock); 924 put_task_struct(p); 925 return ret; 926 } 927 928 /* 929 * No existing pi state. First waiter. [2] 930 */ 931 pi_state = alloc_pi_state(); 932 933 /* 934 * Initialize the pi_mutex in locked state and make @p 935 * the owner of it: 936 */ 937 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 938 939 /* Store the key for possible exit cleanups: */ 940 pi_state->key = *key; 941 942 WARN_ON(!list_empty(&pi_state->list)); 943 list_add(&pi_state->list, &p->pi_state_list); 944 pi_state->owner = p; 945 raw_spin_unlock_irq(&p->pi_lock); 946 947 put_task_struct(p); 948 949 *ps = pi_state; 950 951 return 0; 952} 953 954static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 955 union futex_key *key, struct futex_pi_state **ps) 956{ 957 struct futex_q *match = futex_top_waiter(hb, key); 958 959 /* 960 * If there is a waiter on that futex, validate it and 961 * attach to the pi_state when the validation succeeds. 962 */ 963 if (match) 964 return attach_to_pi_state(uval, match->pi_state, ps); 965 966 /* 967 * We are the first waiter - try to look up the owner based on 968 * @uval and attach to it. 969 */ 970 return attach_to_pi_owner(uval, key, ps); 971} 972 973static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) 974{ 975 u32 uninitialized_var(curval); 976 977 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 978 return -EFAULT; 979 980 /*If user space value changed, let the caller retry */ 981 return curval != uval ? -EAGAIN : 0; 982} 983 984/** 985 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex 986 * @uaddr: the pi futex user address 987 * @hb: the pi futex hash bucket 988 * @key: the futex key associated with uaddr and hb 989 * @ps: the pi_state pointer where we store the result of the 990 * lookup 991 * @task: the task to perform the atomic lock work for. This will 992 * be "current" except in the case of requeue pi. 993 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 994 * 995 * Return: 996 * 0 - ready to wait; 997 * 1 - acquired the lock; 998 * <0 - error 999 * 1000 * The hb->lock and futex_key refs shall be held by the caller. 
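 *
 * Illustrative summary of the cases handled below, with uval being the
 * current user space value and vpid the TID of @task:
 *
 *	(uval & FUTEX_TID_MASK) == vpid	-> -EDEADLK
 *	top waiter queued on @hb	-> attach to its pi_state
 *	(uval & FUTEX_TID_MASK) == 0	-> take the futex over atomically,
 *					   return 1
 *	otherwise			-> set FUTEX_WAITERS, attach to the
 *					   owner's pi_state, return 0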
1001 */ 1002static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, 1003 union futex_key *key, 1004 struct futex_pi_state **ps, 1005 struct task_struct *task, int set_waiters) 1006{ 1007 u32 uval, newval, vpid = task_pid_vnr(task); 1008 struct futex_q *match; 1009 int ret; 1010 1011 /* 1012 * Read the user space value first so we can validate a few 1013 * things before proceeding further. 1014 */ 1015 if (get_futex_value_locked(&uval, uaddr)) 1016 return -EFAULT; 1017 1018 /* 1019 * Detect deadlocks. 1020 */ 1021 if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) 1022 return -EDEADLK; 1023 1024 /* 1025 * Lookup existing state first. If it exists, try to attach to 1026 * its pi_state. 1027 */ 1028 match = futex_top_waiter(hb, key); 1029 if (match) 1030 return attach_to_pi_state(uval, match->pi_state, ps); 1031 1032 /* 1033 * No waiter and user TID is 0. We are here because the 1034 * waiters or the owner died bit is set or called from 1035 * requeue_cmp_pi or for whatever reason something took the 1036 * syscall. 1037 */ 1038 if (!(uval & FUTEX_TID_MASK)) { 1039 /* 1040 * We take over the futex. No other waiters and the user space 1041 * TID is 0. We preserve the owner died bit. 1042 */ 1043 newval = uval & FUTEX_OWNER_DIED; 1044 newval |= vpid; 1045 1046 /* The futex requeue_pi code can enforce the waiters bit */ 1047 if (set_waiters) 1048 newval |= FUTEX_WAITERS; 1049 1050 ret = lock_pi_update_atomic(uaddr, uval, newval); 1051 /* If the take over worked, return 1 */ 1052 return ret < 0 ? ret : 1; 1053 } 1054 1055 /* 1056 * First waiter. Set the waiters bit before attaching ourself to 1057 * the owner. If owner tries to unlock, it will be forced into 1058 * the kernel and blocked on hb->lock. 1059 */ 1060 newval = uval | FUTEX_WAITERS; 1061 ret = lock_pi_update_atomic(uaddr, uval, newval); 1062 if (ret) 1063 return ret; 1064 /* 1065 * If the update of the user space value succeeded, we try to 1066 * attach to the owner. If that fails, no harm done, we only 1067 * set the FUTEX_WAITERS bit in the user space variable. 1068 */ 1069 return attach_to_pi_owner(uval, key, ps); 1070} 1071 1072/** 1073 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket 1074 * @q: The futex_q to unqueue 1075 * 1076 * The q->lock_ptr must not be NULL and must be held by the caller. 1077 */ 1078static void __unqueue_futex(struct futex_q *q) 1079{ 1080 struct futex_hash_bucket *hb; 1081 1082 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) 1083 || WARN_ON(plist_node_empty(&q->list))) 1084 return; 1085 1086 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); 1087 plist_del(&q->list, &hb->chain); 1088 hb_waiters_dec(hb); 1089} 1090 1091/* 1092 * The hash bucket lock must be held when this is called. 1093 * Afterwards, the futex_q must not be accessed. 1094 */ 1095static void wake_futex(struct futex_q *q) 1096{ 1097 struct task_struct *p = q->task; 1098 1099 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) 1100 return; 1101 1102 /* 1103 * We set q->lock_ptr = NULL _before_ we wake up the task. If 1104 * a non-futex wake up happens on another CPU then the task 1105 * might exit and p would dereference a non-existing task 1106 * struct. Prevent this by holding a reference on p across the 1107 * wake up. 1108 */ 1109 get_task_struct(p); 1110 1111 __unqueue_futex(q); 1112 /* 1113 * The waiting task can free the futex_q as soon as 1114 * q->lock_ptr = NULL is written, without taking any locks. 
A 1115 * memory barrier is required here to prevent the following 1116 * store to lock_ptr from getting ahead of the plist_del. 1117 */ 1118 smp_wmb(); 1119 q->lock_ptr = NULL; 1120 1121 wake_up_state(p, TASK_NORMAL); 1122 put_task_struct(p); 1123} 1124 1125static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 1126{ 1127 struct task_struct *new_owner; 1128 struct futex_pi_state *pi_state = this->pi_state; 1129 u32 uninitialized_var(curval), newval; 1130 int ret = 0; 1131 1132 if (!pi_state) 1133 return -EINVAL; 1134 1135 /* 1136 * If current does not own the pi_state then the futex is 1137 * inconsistent and user space fiddled with the futex value. 1138 */ 1139 if (pi_state->owner != current) 1140 return -EINVAL; 1141 1142 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 1143 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 1144 1145 /* 1146 * It is possible that the next waiter (the one that brought 1147 * this owner to the kernel) timed out and is no longer 1148 * waiting on the lock. 1149 */ 1150 if (!new_owner) 1151 new_owner = this->task; 1152 1153 /* 1154 * We pass it to the next owner. The WAITERS bit is always 1155 * kept enabled while there is PI state around. We cleanup the 1156 * owner died bit, because we are the owner. 1157 */ 1158 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1159 1160 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { 1161 ret = -EFAULT; 1162 } else if (curval != uval) { 1163 /* 1164 * If a unconditional UNLOCK_PI operation (user space did not 1165 * try the TID->0 transition) raced with a waiter setting the 1166 * FUTEX_WAITERS flag between get_user() and locking the hash 1167 * bucket lock, retry the operation. 1168 */ 1169 if ((FUTEX_TID_MASK & curval) == uval) 1170 ret = -EAGAIN; 1171 else 1172 ret = -EINVAL; 1173 } 1174 if (ret) { 1175 raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 1176 return ret; 1177 } 1178 1179 raw_spin_lock_irq(&pi_state->owner->pi_lock); 1180 WARN_ON(list_empty(&pi_state->list)); 1181 list_del_init(&pi_state->list); 1182 raw_spin_unlock_irq(&pi_state->owner->pi_lock); 1183 1184 raw_spin_lock_irq(&new_owner->pi_lock); 1185 WARN_ON(!list_empty(&pi_state->list)); 1186 list_add(&pi_state->list, &new_owner->pi_state_list); 1187 pi_state->owner = new_owner; 1188 raw_spin_unlock_irq(&new_owner->pi_lock); 1189 1190 raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 1191 rt_mutex_unlock(&pi_state->pi_mutex); 1192 1193 return 0; 1194} 1195 1196/* 1197 * Express the locking dependencies for lockdep: 1198 */ 1199static inline void 1200double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1201{ 1202 if (hb1 <= hb2) { 1203 spin_lock(&hb1->lock); 1204 if (hb1 < hb2) 1205 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); 1206 } else { /* hb1 > hb2 */ 1207 spin_lock(&hb2->lock); 1208 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); 1209 } 1210} 1211 1212static inline void 1213double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1214{ 1215 spin_unlock(&hb1->lock); 1216 if (hb1 != hb2) 1217 spin_unlock(&hb2->lock); 1218} 1219 1220/* 1221 * Wake up waiters matching bitset queued on this futex (uaddr). 
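 *
 * A waiter queued via FUTEX_WAIT_BITSET records its bitset in its futex_q;
 * a plain FUTEX_WAIT records FUTEX_BITSET_MATCH_ANY (all bits set).  A
 * wakeup matches when waiter and waker bitsets share at least one bit,
 * e.g. (illustrative):
 *
 *	waiter bitset 0x1, wake bitset 0x3 -> woken
 *	waiter bitset 0x4, wake bitset 0x3 -> skipped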
1222 */ 1223static int 1224futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) 1225{ 1226 struct futex_hash_bucket *hb; 1227 struct futex_q *this, *next; 1228 union futex_key key = FUTEX_KEY_INIT; 1229 int ret; 1230 1231 if (!bitset) 1232 return -EINVAL; 1233 1234 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); 1235 if (unlikely(ret != 0)) 1236 goto out; 1237 1238 hb = hash_futex(&key); 1239 1240 /* Make sure we really have tasks to wakeup */ 1241 if (!hb_waiters_pending(hb)) 1242 goto out_put_key; 1243 1244 spin_lock(&hb->lock); 1245 1246 plist_for_each_entry_safe(this, next, &hb->chain, list) { 1247 if (match_futex (&this->key, &key)) { 1248 if (this->pi_state || this->rt_waiter) { 1249 ret = -EINVAL; 1250 break; 1251 } 1252 1253 /* Check if one of the bits is set in both bitsets */ 1254 if (!(this->bitset & bitset)) 1255 continue; 1256 1257 wake_futex(this); 1258 if (++ret >= nr_wake) 1259 break; 1260 } 1261 } 1262 1263 spin_unlock(&hb->lock); 1264out_put_key: 1265 put_futex_key(&key); 1266out: 1267 return ret; 1268} 1269 1270/* 1271 * Wake up all waiters hashed on the physical page that is mapped 1272 * to this virtual address: 1273 */ 1274static int 1275futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, 1276 int nr_wake, int nr_wake2, int op) 1277{ 1278 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1279 struct futex_hash_bucket *hb1, *hb2; 1280 struct futex_q *this, *next; 1281 int ret, op_ret; 1282 1283retry: 1284 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1285 if (unlikely(ret != 0)) 1286 goto out; 1287 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 1288 if (unlikely(ret != 0)) 1289 goto out_put_key1; 1290 1291 hb1 = hash_futex(&key1); 1292 hb2 = hash_futex(&key2); 1293 1294retry_private: 1295 double_lock_hb(hb1, hb2); 1296 op_ret = futex_atomic_op_inuser(op, uaddr2); 1297 if (unlikely(op_ret < 0)) { 1298 1299 double_unlock_hb(hb1, hb2); 1300 1301#ifndef CONFIG_MMU 1302 /* 1303 * we don't get EFAULT from MMU faults if we don't have an MMU, 1304 * but we might get them from range checking 1305 */ 1306 ret = op_ret; 1307 goto out_put_keys; 1308#endif 1309 1310 if (unlikely(op_ret != -EFAULT)) { 1311 ret = op_ret; 1312 goto out_put_keys; 1313 } 1314 1315 ret = fault_in_user_writeable(uaddr2); 1316 if (ret) 1317 goto out_put_keys; 1318 1319 if (!(flags & FLAGS_SHARED)) 1320 goto retry_private; 1321 1322 put_futex_key(&key2); 1323 put_futex_key(&key1); 1324 goto retry; 1325 } 1326 1327 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 1328 if (match_futex (&this->key, &key1)) { 1329 if (this->pi_state || this->rt_waiter) { 1330 ret = -EINVAL; 1331 goto out_unlock; 1332 } 1333 wake_futex(this); 1334 if (++ret >= nr_wake) 1335 break; 1336 } 1337 } 1338 1339 if (op_ret > 0) { 1340 op_ret = 0; 1341 plist_for_each_entry_safe(this, next, &hb2->chain, list) { 1342 if (match_futex (&this->key, &key2)) { 1343 if (this->pi_state || this->rt_waiter) { 1344 ret = -EINVAL; 1345 goto out_unlock; 1346 } 1347 wake_futex(this); 1348 if (++op_ret >= nr_wake2) 1349 break; 1350 } 1351 } 1352 ret += op_ret; 1353 } 1354 1355out_unlock: 1356 double_unlock_hb(hb1, hb2); 1357out_put_keys: 1358 put_futex_key(&key2); 1359out_put_key1: 1360 put_futex_key(&key1); 1361out: 1362 return ret; 1363} 1364 1365/** 1366 * requeue_futex() - Requeue a futex_q from one hb to another 1367 * @q: the futex_q to requeue 1368 * @hb1: the source hash_bucket 1369 * @hb2: the target 
hash_bucket
 * @key2:	the new key for the requeued futex_q
 */
static inline
void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
		   struct futex_hash_bucket *hb2, union futex_key *key2)
{

	/*
	 * If key1 and key2 hash to the same bucket, no need to
	 * requeue.
	 */
	if (likely(&hb1->chain != &hb2->chain)) {
		plist_del(&q->list, &hb1->chain);
		hb_waiters_dec(hb1);
		plist_add(&q->list, &hb2->chain);
		hb_waiters_inc(hb2);
		q->lock_ptr = &hb2->lock;
	}
	get_futex_key_refs(key2);
	q->key = *key2;
}

/**
 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
 * @q:		the futex_q
 * @key:	the key of the requeue target futex
 * @hb:		the hash_bucket of the requeue target futex
 *
 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
 * target futex if it is uncontended or via a lock steal.  Set the futex_q key
 * to the requeue target futex so the waiter can detect the wakeup on the right
 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
 * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
 * to protect access to the pi_state to fixup the owner later.  Must be called
 * with both q->lock_ptr and hb->lock held.
 */
static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
			   struct futex_hash_bucket *hb)
{
	get_futex_key_refs(key);
	q->key = *key;

	__unqueue_futex(q);

	WARN_ON(!q->rt_waiter);
	q->rt_waiter = NULL;

	q->lock_ptr = &hb->lock;

	wake_up_state(q->task, TASK_NORMAL);
}

/**
 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
 * @pifutex:		the user address of the requeue target futex
 * @hb1:		the source futex hash bucket, must be locked by the caller
 * @hb2:		the target futex hash bucket, must be locked by the caller
 * @key1:		the source futex key
 * @key2:		the target futex key
 * @ps:			address to store the pi_state pointer
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Try to get the lock on behalf of the top waiter if we can do it atomically.
 * Wake the top waiter if we succeed.  If the caller specified set_waiters,
 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
 * hb1 and hb2 must be held by the caller.
 *
 * Return:
 *  0 - failed to acquire the lock atomically;
 * >0 - acquired the lock, return value is vpid of the top_waiter
 * <0 - error
 */
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
				      struct futex_hash_bucket *hb1,
				      struct futex_hash_bucket *hb2,
				      union futex_key *key1, union futex_key *key2,
				      struct futex_pi_state **ps, int set_waiters)
{
	struct futex_q *top_waiter = NULL;
	u32 curval;
	int ret, vpid;

	if (get_futex_value_locked(&curval, pifutex))
		return -EFAULT;

	/*
	 * Find the top_waiter and determine if there are additional waiters.
	 * If the caller intends to requeue more than 1 waiter to pifutex,
	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
	 * as we have means to handle the possible fault.  If not, don't set
	 * the bit unnecessarily as it will force the subsequent unlock to enter
	 * the kernel.
1463 */ 1464 top_waiter = futex_top_waiter(hb1, key1); 1465 1466 /* There are no waiters, nothing for us to do. */ 1467 if (!top_waiter) 1468 return 0; 1469 1470 /* Ensure we requeue to the expected futex. */ 1471 if (!match_futex(top_waiter->requeue_pi_key, key2)) 1472 return -EINVAL; 1473 1474 /* 1475 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1476 * the contended case or if set_waiters is 1. The pi_state is returned 1477 * in ps in contended cases. 1478 */ 1479 vpid = task_pid_vnr(top_waiter->task); 1480 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1481 set_waiters); 1482 if (ret == 1) { 1483 requeue_pi_wake_futex(top_waiter, key2, hb2); 1484 return vpid; 1485 } 1486 return ret; 1487} 1488 1489/** 1490 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1491 * @uaddr1: source futex user address 1492 * @flags: futex flags (FLAGS_SHARED, etc.) 1493 * @uaddr2: target futex user address 1494 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1495 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1496 * @cmpval: @uaddr1 expected value (or %NULL) 1497 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1498 * pi futex (pi to pi requeue is not supported) 1499 * 1500 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1501 * uaddr2 atomically on behalf of the top waiter. 1502 * 1503 * Return: 1504 * >=0 - on success, the number of tasks requeued or woken; 1505 * <0 - on error 1506 */ 1507static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1508 u32 __user *uaddr2, int nr_wake, int nr_requeue, 1509 u32 *cmpval, int requeue_pi) 1510{ 1511 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1512 int drop_count = 0, task_count = 0, ret; 1513 struct futex_pi_state *pi_state = NULL; 1514 struct futex_hash_bucket *hb1, *hb2; 1515 struct futex_q *this, *next; 1516 1517 if (requeue_pi) { 1518 /* 1519 * Requeue PI only works on two distinct uaddrs. This 1520 * check is only valid for private futexes. See below. 1521 */ 1522 if (uaddr1 == uaddr2) 1523 return -EINVAL; 1524 1525 /* 1526 * requeue_pi requires a pi_state, try to allocate it now 1527 * without any locks in case it fails. 1528 */ 1529 if (refill_pi_state_cache()) 1530 return -ENOMEM; 1531 /* 1532 * requeue_pi must wake as many tasks as it can, up to nr_wake 1533 * + nr_requeue, since it acquires the rt_mutex prior to 1534 * returning to userspace, so as to not leave the rt_mutex with 1535 * waiters and no owner. However, second and third wake-ups 1536 * cannot be predicted as they involve race conditions with the 1537 * first wake and a fault while looking up the pi_state. Both 1538 * pthread_cond_signal() and pthread_cond_broadcast() should 1539 * use nr_wake=1. 1540 */ 1541 if (nr_wake != 1) 1542 return -EINVAL; 1543 } 1544 1545retry: 1546 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1547 if (unlikely(ret != 0)) 1548 goto out; 1549 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, 1550 requeue_pi ? VERIFY_WRITE : VERIFY_READ); 1551 if (unlikely(ret != 0)) 1552 goto out_put_key1; 1553 1554 /* 1555 * The check above which compares uaddrs is not sufficient for 1556 * shared futexes. 
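 * (Two different user space addresses, possibly in different processes,
 * can map the same page of a shared mapping and thus denote the same
 * futex.)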
We need to compare the keys: 1557 */ 1558 if (requeue_pi && match_futex(&key1, &key2)) { 1559 ret = -EINVAL; 1560 goto out_put_keys; 1561 } 1562 1563 hb1 = hash_futex(&key1); 1564 hb2 = hash_futex(&key2); 1565 1566retry_private: 1567 hb_waiters_inc(hb2); 1568 double_lock_hb(hb1, hb2); 1569 1570 if (likely(cmpval != NULL)) { 1571 u32 curval; 1572 1573 ret = get_futex_value_locked(&curval, uaddr1); 1574 1575 if (unlikely(ret)) { 1576 double_unlock_hb(hb1, hb2); 1577 hb_waiters_dec(hb2); 1578 1579 ret = get_user(curval, uaddr1); 1580 if (ret) 1581 goto out_put_keys; 1582 1583 if (!(flags & FLAGS_SHARED)) 1584 goto retry_private; 1585 1586 put_futex_key(&key2); 1587 put_futex_key(&key1); 1588 goto retry; 1589 } 1590 if (curval != *cmpval) { 1591 ret = -EAGAIN; 1592 goto out_unlock; 1593 } 1594 } 1595 1596 if (requeue_pi && (task_count - nr_wake < nr_requeue)) { 1597 /* 1598 * Attempt to acquire uaddr2 and wake the top waiter. If we 1599 * intend to requeue waiters, force setting the FUTEX_WAITERS 1600 * bit. We force this here where we are able to easily handle 1601 * faults rather in the requeue loop below. 1602 */ 1603 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, 1604 &key2, &pi_state, nr_requeue); 1605 1606 /* 1607 * At this point the top_waiter has either taken uaddr2 or is 1608 * waiting on it. If the former, then the pi_state will not 1609 * exist yet, look it up one more time to ensure we have a 1610 * reference to it. If the lock was taken, ret contains the 1611 * vpid of the top waiter task. 1612 */ 1613 if (ret > 0) { 1614 WARN_ON(pi_state); 1615 drop_count++; 1616 task_count++; 1617 /* 1618 * If we acquired the lock, then the user 1619 * space value of uaddr2 should be vpid. It 1620 * cannot be changed by the top waiter as it 1621 * is blocked on hb2 lock if it tries to do 1622 * so. If something fiddled with it behind our 1623 * back the pi state lookup might unearth 1624 * it. So we rather use the known value than 1625 * rereading and handing potential crap to 1626 * lookup_pi_state. 1627 */ 1628 ret = lookup_pi_state(ret, hb2, &key2, &pi_state); 1629 } 1630 1631 switch (ret) { 1632 case 0: 1633 break; 1634 case -EFAULT: 1635 free_pi_state(pi_state); 1636 pi_state = NULL; 1637 double_unlock_hb(hb1, hb2); 1638 hb_waiters_dec(hb2); 1639 put_futex_key(&key2); 1640 put_futex_key(&key1); 1641 ret = fault_in_user_writeable(uaddr2); 1642 if (!ret) 1643 goto retry; 1644 goto out; 1645 case -EAGAIN: 1646 /* 1647 * Two reasons for this: 1648 * - Owner is exiting and we just wait for the 1649 * exit to complete. 1650 * - The user space value changed. 1651 */ 1652 free_pi_state(pi_state); 1653 pi_state = NULL; 1654 double_unlock_hb(hb1, hb2); 1655 hb_waiters_dec(hb2); 1656 put_futex_key(&key2); 1657 put_futex_key(&key1); 1658 cond_resched(); 1659 goto retry; 1660 default: 1661 goto out_unlock; 1662 } 1663 } 1664 1665 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 1666 if (task_count - nr_wake >= nr_requeue) 1667 break; 1668 1669 if (!match_futex(&this->key, &key1)) 1670 continue; 1671 1672 /* 1673 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 1674 * be paired with each other and no other futex ops. 1675 * 1676 * We should never be requeueing a futex_q with a pi_state, 1677 * which is awaiting a futex_unlock_pi(). 1678 */ 1679 if ((requeue_pi && !this->rt_waiter) || 1680 (!requeue_pi && this->rt_waiter) || 1681 this->pi_state) { 1682 ret = -EINVAL; 1683 break; 1684 } 1685 1686 /* 1687 * Wake nr_wake waiters. 
For requeue_pi, if we acquired the 1688 * lock, we already woke the top_waiter. If not, it will be 1689 * woken by futex_unlock_pi(). 1690 */ 1691 if (++task_count <= nr_wake && !requeue_pi) { 1692 wake_futex(this); 1693 continue; 1694 } 1695 1696 /* Ensure we requeue to the expected futex for requeue_pi. */ 1697 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { 1698 ret = -EINVAL; 1699 break; 1700 } 1701 1702 /* 1703 * Requeue nr_requeue waiters and possibly one more in the case 1704 * of requeue_pi if we couldn't acquire the lock atomically. 1705 */ 1706 if (requeue_pi) { 1707 /* Prepare the waiter to take the rt_mutex. */ 1708 atomic_inc(&pi_state->refcount); 1709 this->pi_state = pi_state; 1710 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 1711 this->rt_waiter, 1712 this->task); 1713 if (ret == 1) { 1714 /* We got the lock. */ 1715 requeue_pi_wake_futex(this, &key2, hb2); 1716 drop_count++; 1717 continue; 1718 } else if (ret) { 1719 /* -EDEADLK */ 1720 this->pi_state = NULL; 1721 free_pi_state(pi_state); 1722 goto out_unlock; 1723 } 1724 } 1725 requeue_futex(this, hb1, hb2, &key2); 1726 drop_count++; 1727 } 1728 1729out_unlock: 1730 free_pi_state(pi_state); 1731 double_unlock_hb(hb1, hb2); 1732 hb_waiters_dec(hb2); 1733 1734 /* 1735 * drop_futex_key_refs() must be called outside the spinlocks. During 1736 * the requeue we moved futex_q's from the hash bucket at key1 to the 1737 * one at key2 and updated their key pointer. We no longer need to 1738 * hold the references to key1. 1739 */ 1740 while (--drop_count >= 0) 1741 drop_futex_key_refs(&key1); 1742 1743out_put_keys: 1744 put_futex_key(&key2); 1745out_put_key1: 1746 put_futex_key(&key1); 1747out: 1748 return ret ? ret : task_count; 1749} 1750 1751/* The key must be already stored in q->key. */ 1752static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1753 __acquires(&hb->lock) 1754{ 1755 struct futex_hash_bucket *hb; 1756 1757 hb = hash_futex(&q->key); 1758 1759 /* 1760 * Increment the counter before taking the lock so that 1761 * a potential waker won't miss a to-be-slept task that is 1762 * waiting for the spinlock. This is safe as all queue_lock() 1763 * users end up calling queue_me(). Similarly, for housekeeping, 1764 * decrement the counter at queue_unlock() when some error has 1765 * occurred and we don't end up adding the task to the list. 1766 */ 1767 hb_waiters_inc(hb); 1768 1769 q->lock_ptr = &hb->lock; 1770 1771 spin_lock(&hb->lock); /* implies MB (A) */ 1772 return hb; 1773} 1774 1775static inline void 1776queue_unlock(struct futex_hash_bucket *hb) 1777 __releases(&hb->lock) 1778{ 1779 spin_unlock(&hb->lock); 1780 hb_waiters_dec(hb); 1781} 1782 1783/** 1784 * queue_me() - Enqueue the futex_q on the futex_hash_bucket 1785 * @q: The futex_q to enqueue 1786 * @hb: The destination hash bucket 1787 * 1788 * The hb->lock must be held by the caller, and is released here. A call to 1789 * queue_me() is typically paired with exactly one call to unqueue_me(). The 1790 * exceptions involve the PI related operations, which may use unqueue_me_pi() 1791 * or nothing if the unqueue is done as part of the wake process and the unqueue 1792 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for 1793 * an example). 
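 *
 * The canonical wait side pairing looks roughly like this (illustrative
 * sketch, see futex_wait_setup() and futex_wait() further down):
 *
 *	hb = queue_lock(&q);
 *	ret = get_futex_value_locked(&uval, uaddr);
 *	if (ret || uval != val)
 *		queue_unlock(hb);		<- never queued
 *	else
 *		queue_me(&q, hb);		<- drops hb->lock
 *
 * followed by schedule() and finally unqueue_me(&q), which is a no-op if
 * the waker already removed us from the hash list.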
 */
static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
	q->task = current;
	spin_unlock(&hb->lock);
}

/**
 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
 * be paired with exactly one earlier call to queue_me().
 *
 * Return:
 *   1 - if the futex_q was still queued (and we unqueued it);
 *   0 - if the futex_q was already removed by the waking thread
 */
static int unqueue_me(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	lock_ptr = q->lock_ptr;
	barrier();
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock.  This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock().  It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock().  It cannot,
		 * however, change back to the original value.  Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__unqueue_futex(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	drop_futex_key_refs(&q->key);
	return ret;
}

/*
 * PI futexes cannot be requeued and must remove themselves from the
 * hash bucket.  The hash bucket lock (i.e. lock_ptr) is held on entry
 * and dropped here.
 */
static void unqueue_me_pi(struct futex_q *q)
	__releases(q->lock_ptr)
{
	__unqueue_futex(q);

	BUG_ON(!q->pi_state);
	free_pi_state(q->pi_state);
	q->pi_state = NULL;

	spin_unlock(q->lock_ptr);
}

/*
 * Fixup the pi_state owner with the new owner.
 *
 * Must be called with hash bucket lock held and mm->sem held for
 * non-private futexes.
 */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *newowner)
{
	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner = pi_state->owner;
	u32 uval, uninitialized_var(curval), newval;
	int ret;

	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	/*
	 * We are here either because we stole the rtmutex from the
	 * previous highest priority waiter or we are the highest priority
	 * waiter but failed to get the rtmutex the first time.
1907 * We have to replace the newowner TID in the user space variable. 1908 * This must be atomic as we have to preserve the owner died bit here. 1909 * 1910 * Note: We write the user space value _before_ changing the pi_state 1911 * because we can fault here. Imagine swapped out pages or a fork 1912 * that marked all the anonymous memory readonly for cow. 1913 * 1914 * Modifying pi_state _before_ the user space value would 1915 * leave the pi_state in an inconsistent state when we fault 1916 * here, because we need to drop the hash bucket lock to 1917 * handle the fault. This might be observed in the PID check 1918 * in lookup_pi_state. 1919 */ 1920retry: 1921 if (get_futex_value_locked(&uval, uaddr)) 1922 goto handle_fault; 1923 1924 while (1) { 1925 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1926 1927 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 1928 goto handle_fault; 1929 if (curval == uval) 1930 break; 1931 uval = curval; 1932 } 1933 1934 /* 1935 * We fixed up user space. Now we need to fix the pi_state 1936 * itself. 1937 */ 1938 if (pi_state->owner != NULL) { 1939 raw_spin_lock_irq(&pi_state->owner->pi_lock); 1940 WARN_ON(list_empty(&pi_state->list)); 1941 list_del_init(&pi_state->list); 1942 raw_spin_unlock_irq(&pi_state->owner->pi_lock); 1943 } 1944 1945 pi_state->owner = newowner; 1946 1947 raw_spin_lock_irq(&newowner->pi_lock); 1948 WARN_ON(!list_empty(&pi_state->list)); 1949 list_add(&pi_state->list, &newowner->pi_state_list); 1950 raw_spin_unlock_irq(&newowner->pi_lock); 1951 return 0; 1952 1953 /* 1954 * To handle the page fault we need to drop the hash bucket 1955 * lock here. That gives the other task (either the highest priority 1956 * waiter itself or the task which stole the rtmutex) the 1957 * chance to try the fixup of the pi_state. So once we are 1958 * back from handling the fault we need to check the pi_state 1959 * after reacquiring the hash bucket lock and before trying to 1960 * do another fixup. When the fixup has been done already we 1961 * simply return. 1962 */ 1963handle_fault: 1964 spin_unlock(q->lock_ptr); 1965 1966 ret = fault_in_user_writeable(uaddr); 1967 1968 spin_lock(q->lock_ptr); 1969 1970 /* 1971 * Check if someone else fixed it for us: 1972 */ 1973 if (pi_state->owner != oldowner) 1974 return 0; 1975 1976 if (ret) 1977 return ret; 1978 1979 goto retry; 1980} 1981 1982static long futex_wait_restart(struct restart_block *restart); 1983 1984/** 1985 * fixup_owner() - Post lock pi_state and corner case management 1986 * @uaddr: user address of the futex 1987 * @q: futex_q (contains pi_state and access to the rt_mutex) 1988 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1989 * 1990 * After attempting to lock an rt_mutex, this function is called to cleanup 1991 * the pi_state owner as well as handle race conditions that may allow us to 1992 * acquire the lock. Must be called with the hb lock held. 1993 * 1994 * Return: 1995 * 1 - success, lock taken; 1996 * 0 - success, lock not taken; 1997 * <0 - on error (-EFAULT) 1998 */ 1999static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 2000{ 2001 struct task_struct *owner; 2002 int ret = 0; 2003 2004 if (locked) { 2005 /* 2006 * Got the lock. 
We might not be the anticipated owner if we 2007 * did a lock-steal - fix up the PI-state in that case: 2008 */ 2009 if (q->pi_state->owner != current) 2010 ret = fixup_pi_state_owner(uaddr, q, current); 2011 goto out; 2012 } 2013 2014 /* 2015 * Catch the rare case, where the lock was released when we were on the 2016 * way back before we locked the hash bucket. 2017 */ 2018 if (q->pi_state->owner == current) { 2019 /* 2020 * Try to get the rt_mutex now. This might fail as some other 2021 * task acquired the rt_mutex after we removed ourself from the 2022 * rt_mutex waiters list. 2023 */ 2024 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { 2025 locked = 1; 2026 goto out; 2027 } 2028 2029 /* 2030 * pi_state is incorrect, some other task did a lock steal and 2031 * we returned due to timeout or signal without taking the 2032 * rt_mutex. Too late. 2033 */ 2034 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); 2035 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 2036 if (!owner) 2037 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); 2038 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); 2039 ret = fixup_pi_state_owner(uaddr, q, owner); 2040 goto out; 2041 } 2042 2043 /* 2044 * Paranoia check. If we did not take the lock, then we should not be 2045 * the owner of the rt_mutex. 2046 */ 2047 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 2048 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 2049 "pi-state %p\n", ret, 2050 q->pi_state->pi_mutex.owner, 2051 q->pi_state->owner); 2052 2053out: 2054 return ret ? ret : locked; 2055} 2056 2057/** 2058 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal 2059 * @hb: the futex hash bucket, must be locked by the caller 2060 * @q: the futex_q to queue up on 2061 * @timeout: the prepared hrtimer_sleeper, or null for no timeout 2062 */ 2063static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 2064 struct hrtimer_sleeper *timeout) 2065{ 2066 /* 2067 * The task state is guaranteed to be set before another task can 2068 * wake it. set_current_state() is implemented using set_mb() and 2069 * queue_me() calls spin_unlock() upon completion, both serializing 2070 * access to the hash list and forcing another memory barrier. 2071 */ 2072 set_current_state(TASK_INTERRUPTIBLE); 2073 queue_me(q, hb); 2074 2075 /* Arm the timer */ 2076 if (timeout) { 2077 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); 2078 if (!hrtimer_active(&timeout->timer)) 2079 timeout->task = NULL; 2080 } 2081 2082 /* 2083 * If we have been removed from the hash list, then another task 2084 * has tried to wake us, and we can skip the call to schedule(). 2085 */ 2086 if (likely(!plist_node_empty(&q->list))) { 2087 /* 2088 * If the timer has already expired, current will already be 2089 * flagged for rescheduling. Only call schedule if there 2090 * is no timeout, or if it has yet to expire. 2091 */ 2092 if (!timeout || timeout->task) 2093 freezable_schedule(); 2094 } 2095 __set_current_state(TASK_RUNNING); 2096} 2097 2098/** 2099 * futex_wait_setup() - Prepare to wait on a futex 2100 * @uaddr: the futex userspace address 2101 * @val: the expected value 2102 * @flags: futex flags (FLAGS_SHARED, etc.) 2103 * @q: the associated futex_q 2104 * @hb: storage for hash_bucket pointer to be returned to caller 2105 * 2106 * Setup the futex_q and locate the hash_bucket. Get the futex value and 2107 * compare it with the expected value. Handle atomic faults internally. 
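 *
 * In sketch form, a caller uses it the way futex_wait() below does:
 *
 *	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 *	if (ret)
 *		return ret;			// hb unlocked, no q.key reference
 *	futex_wait_queue_me(hb, &q, to);	// enqueue on hb and sleep
 *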
2108 * Return with the hb lock held and a q.key reference on success, and unlocked 2109 * with no q.key reference on failure. 2110 * 2111 * Return: 2112 * 0 - uaddr contains val and hb has been locked; 2113 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 2114 */ 2115static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 2116 struct futex_q *q, struct futex_hash_bucket **hb) 2117{ 2118 u32 uval; 2119 int ret; 2120 2121 /* 2122 * Access the page AFTER the hash-bucket is locked. 2123 * Order is important: 2124 * 2125 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); 2126 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } 2127 * 2128 * The basic logical guarantee of a futex is that it blocks ONLY 2129 * if cond(var) is known to be true at the time of blocking, for 2130 * any cond. If we locked the hash-bucket after testing *uaddr, that 2131 * would open a race condition where we could block indefinitely with 2132 * cond(var) false, which would violate the guarantee. 2133 * 2134 * On the other hand, we insert q and release the hash-bucket only 2135 * after testing *uaddr. This guarantees that futex_wait() will NOT 2136 * absorb a wakeup if *uaddr does not match the desired values 2137 * while the syscall executes. 2138 */ 2139retry: 2140 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); 2141 if (unlikely(ret != 0)) 2142 return ret; 2143 2144retry_private: 2145 *hb = queue_lock(q); 2146 2147 ret = get_futex_value_locked(&uval, uaddr); 2148 2149 if (ret) { 2150 queue_unlock(*hb); 2151 2152 ret = get_user(uval, uaddr); 2153 if (ret) 2154 goto out; 2155 2156 if (!(flags & FLAGS_SHARED)) 2157 goto retry_private; 2158 2159 put_futex_key(&q->key); 2160 goto retry; 2161 } 2162 2163 if (uval != val) { 2164 queue_unlock(*hb); 2165 ret = -EWOULDBLOCK; 2166 } 2167 2168out: 2169 if (ret) 2170 put_futex_key(&q->key); 2171 return ret; 2172} 2173 2174static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, 2175 ktime_t *abs_time, u32 bitset) 2176{ 2177 struct hrtimer_sleeper timeout, *to = NULL; 2178 struct restart_block *restart; 2179 struct futex_hash_bucket *hb; 2180 struct futex_q q = futex_q_init; 2181 int ret; 2182 2183 if (!bitset) 2184 return -EINVAL; 2185 q.bitset = bitset; 2186 2187 if (abs_time) { 2188 to = &timeout; 2189 2190 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 2191 CLOCK_REALTIME : CLOCK_MONOTONIC, 2192 HRTIMER_MODE_ABS); 2193 hrtimer_init_sleeper(to, current); 2194 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2195 current->timer_slack_ns); 2196 } 2197 2198retry: 2199 /* 2200 * Prepare to wait on uaddr. On success, holds hb lock and increments 2201 * q.key refs. 2202 */ 2203 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 2204 if (ret) 2205 goto out; 2206 2207 /* queue_me and wait for wakeup, timeout, or a signal. */ 2208 futex_wait_queue_me(hb, &q, to); 2209 2210 /* If we were woken (and unqueued), we succeeded, whatever. */ 2211 ret = 0; 2212 /* unqueue_me() drops q.key ref */ 2213 if (!unqueue_me(&q)) 2214 goto out; 2215 ret = -ETIMEDOUT; 2216 if (to && !to->task) 2217 goto out; 2218 2219 /* 2220 * We expect signal_pending(current), but we might be the 2221 * victim of a spurious wakeup as well. 
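 *
 * Either way, a correct FUTEX_WAIT user re-checks its condition after
 * every return; in a hedged userspace sketch ('futex_var' and cond()
 * are illustrative, not part of this file):
 *
 *	for (;;) {
 *		val = atomic_load(&futex_var);
 *		if (cond(val))
 *			break;
 *		syscall(SYS_futex, &futex_var, FUTEX_WAIT, val, NULL, NULL, 0);
 *	}
 *
 * so a wakeup that finds nothing changed merely costs one more loop.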
2222 */ 2223 if (!signal_pending(current)) 2224 goto retry; 2225 2226 ret = -ERESTARTSYS; 2227 if (!abs_time) 2228 goto out; 2229 2230 restart = ¤t->restart_block; 2231 restart->fn = futex_wait_restart; 2232 restart->futex.uaddr = uaddr; 2233 restart->futex.val = val; 2234 restart->futex.time = abs_time->tv64; 2235 restart->futex.bitset = bitset; 2236 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; 2237 2238 ret = -ERESTART_RESTARTBLOCK; 2239 2240out: 2241 if (to) { 2242 hrtimer_cancel(&to->timer); 2243 destroy_hrtimer_on_stack(&to->timer); 2244 } 2245 return ret; 2246} 2247 2248 2249static long futex_wait_restart(struct restart_block *restart) 2250{ 2251 u32 __user *uaddr = restart->futex.uaddr; 2252 ktime_t t, *tp = NULL; 2253 2254 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 2255 t.tv64 = restart->futex.time; 2256 tp = &t; 2257 } 2258 restart->fn = do_no_restart_syscall; 2259 2260 return (long)futex_wait(uaddr, restart->futex.flags, 2261 restart->futex.val, tp, restart->futex.bitset); 2262} 2263 2264 2265/* 2266 * Userspace tried a 0 -> TID atomic transition of the futex value 2267 * and failed. The kernel side here does the whole locking operation: 2268 * if there are waiters then it will block, it does PI, etc. (Due to 2269 * races the kernel might see a 0 value of the futex too.) 2270 */ 2271static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, 2272 ktime_t *time, int trylock) 2273{ 2274 struct hrtimer_sleeper timeout, *to = NULL; 2275 struct futex_hash_bucket *hb; 2276 struct futex_q q = futex_q_init; 2277 int res, ret; 2278 2279 if (refill_pi_state_cache()) 2280 return -ENOMEM; 2281 2282 if (time) { 2283 to = &timeout; 2284 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, 2285 HRTIMER_MODE_ABS); 2286 hrtimer_init_sleeper(to, current); 2287 hrtimer_set_expires(&to->timer, *time); 2288 } 2289 2290retry: 2291 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); 2292 if (unlikely(ret != 0)) 2293 goto out; 2294 2295retry_private: 2296 hb = queue_lock(&q); 2297 2298 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); 2299 if (unlikely(ret)) { 2300 switch (ret) { 2301 case 1: 2302 /* We got the lock. */ 2303 ret = 0; 2304 goto out_unlock_put_key; 2305 case -EFAULT: 2306 goto uaddr_faulted; 2307 case -EAGAIN: 2308 /* 2309 * Two reasons for this: 2310 * - Task is exiting and we just wait for the 2311 * exit to complete. 2312 * - The user space value changed. 2313 */ 2314 queue_unlock(hb); 2315 put_futex_key(&q.key); 2316 cond_resched(); 2317 goto retry; 2318 default: 2319 goto out_unlock_put_key; 2320 } 2321 } 2322 2323 /* 2324 * Only actually queue now that the atomic ops are done: 2325 */ 2326 queue_me(&q, hb); 2327 2328 WARN_ON(!q.pi_state); 2329 /* 2330 * Block on the PI mutex: 2331 */ 2332 if (!trylock) { 2333 ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); 2334 } else { 2335 ret = rt_mutex_trylock(&q.pi_state->pi_mutex); 2336 /* Fixup the trylock return value: */ 2337 ret = ret ? 0 : -EWOULDBLOCK; 2338 } 2339 2340 spin_lock(q.lock_ptr); 2341 /* 2342 * Fixup the pi_state owner and possibly acquire the lock if we 2343 * haven't already. 2344 */ 2345 res = fixup_owner(uaddr, &q, !ret); 2346 /* 2347 * If fixup_owner() returned an error, proprogate that. If it acquired 2348 * the lock, clear our -ETIMEDOUT or -EINTR. 2349 */ 2350 if (res) 2351 ret = (res < 0) ? res : 0; 2352 2353 /* 2354 * If fixup_owner() faulted and was unable to handle the fault, unlock 2355 * it and return the fault to userspace. 
2356 */ 2357 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 2358 rt_mutex_unlock(&q.pi_state->pi_mutex); 2359 2360 /* Unqueue and drop the lock */ 2361 unqueue_me_pi(&q); 2362 2363 goto out_put_key; 2364 2365out_unlock_put_key: 2366 queue_unlock(hb); 2367 2368out_put_key: 2369 put_futex_key(&q.key); 2370out: 2371 if (to) 2372 destroy_hrtimer_on_stack(&to->timer); 2373 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2374 2375uaddr_faulted: 2376 queue_unlock(hb); 2377 2378 ret = fault_in_user_writeable(uaddr); 2379 if (ret) 2380 goto out_put_key; 2381 2382 if (!(flags & FLAGS_SHARED)) 2383 goto retry_private; 2384 2385 put_futex_key(&q.key); 2386 goto retry; 2387} 2388 2389/* 2390 * Userspace attempted a TID -> 0 atomic transition, and failed. 2391 * This is the in-kernel slowpath: we look up the PI state (if any), 2392 * and do the rt-mutex unlock. 2393 */ 2394static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) 2395{ 2396 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); 2397 union futex_key key = FUTEX_KEY_INIT; 2398 struct futex_hash_bucket *hb; 2399 struct futex_q *match; 2400 int ret; 2401 2402retry: 2403 if (get_user(uval, uaddr)) 2404 return -EFAULT; 2405 /* 2406 * We release only a lock we actually own: 2407 */ 2408 if ((uval & FUTEX_TID_MASK) != vpid) 2409 return -EPERM; 2410 2411 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); 2412 if (ret) 2413 return ret; 2414 2415 hb = hash_futex(&key); 2416 spin_lock(&hb->lock); 2417 2418 /* 2419 * Check waiters first. We do not trust user space values at 2420 * all and we at least want to know if user space fiddled 2421 * with the futex value instead of blindly unlocking. 2422 */ 2423 match = futex_top_waiter(hb, &key); 2424 if (match) { 2425 ret = wake_futex_pi(uaddr, uval, match); 2426 /* 2427 * The atomic access to the futex value generated a 2428 * pagefault, so retry the user-access and the wakeup: 2429 */ 2430 if (ret == -EFAULT) 2431 goto pi_faulted; 2432 /* 2433 * A unconditional UNLOCK_PI op raced against a waiter 2434 * setting the FUTEX_WAITERS bit. Try again. 2435 */ 2436 if (ret == -EAGAIN) { 2437 spin_unlock(&hb->lock); 2438 put_futex_key(&key); 2439 goto retry; 2440 } 2441 goto out_unlock; 2442 } 2443 2444 /* 2445 * We have no kernel internal state, i.e. no waiters in the 2446 * kernel. Waiters which are about to queue themselves are stuck 2447 * on hb->lock. So we can safely ignore them. We do neither 2448 * preserve the WAITERS bit not the OWNER_DIED one. We are the 2449 * owner. 2450 */ 2451 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) 2452 goto pi_faulted; 2453 2454 /* 2455 * If uval has changed, let user space handle it. 2456 */ 2457 ret = (curval == uval) ? 0 : -EAGAIN; 2458 2459out_unlock: 2460 spin_unlock(&hb->lock); 2461 put_futex_key(&key); 2462 return ret; 2463 2464pi_faulted: 2465 spin_unlock(&hb->lock); 2466 put_futex_key(&key); 2467 2468 ret = fault_in_user_writeable(uaddr); 2469 if (!ret) 2470 goto retry; 2471 2472 return ret; 2473} 2474 2475/** 2476 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex 2477 * @hb: the hash_bucket futex_q was original enqueued on 2478 * @q: the futex_q woken while waiting to be requeued 2479 * @key2: the futex_key of the requeue target futex 2480 * @timeout: the timeout associated with the wait (NULL if none) 2481 * 2482 * Detect if the task was woken on the initial futex as opposed to the requeue 2483 * target futex. 
If so, determine if it was a timeout or a signal that caused 2484 * the wakeup and return the appropriate error code to the caller. Must be 2485 * called with the hb lock held. 2486 * 2487 * Return: 2488 * 0 = no early wakeup detected; 2489 * <0 = -ETIMEDOUT or -ERESTARTNOINTR 2490 */ 2491static inline 2492int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2493 struct futex_q *q, union futex_key *key2, 2494 struct hrtimer_sleeper *timeout) 2495{ 2496 int ret = 0; 2497 2498 /* 2499 * With the hb lock held, we avoid races while we process the wakeup. 2500 * We only need to hold hb (and not hb2) to ensure atomicity as the 2501 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. 2502 * It can't be requeued from uaddr2 to something else since we don't 2503 * support a PI aware source futex for requeue. 2504 */ 2505 if (!match_futex(&q->key, key2)) { 2506 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); 2507 /* 2508 * We were woken prior to requeue by a timeout or a signal. 2509 * Unqueue the futex_q and determine which it was. 2510 */ 2511 plist_del(&q->list, &hb->chain); 2512 hb_waiters_dec(hb); 2513 2514 /* Handle spurious wakeups gracefully */ 2515 ret = -EWOULDBLOCK; 2516 if (timeout && !timeout->task) 2517 ret = -ETIMEDOUT; 2518 else if (signal_pending(current)) 2519 ret = -ERESTARTNOINTR; 2520 } 2521 return ret; 2522} 2523 2524/** 2525 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2526 * @uaddr: the futex we initially wait on (non-pi) 2527 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be 2528 * the same type, no requeueing from private to shared, etc. 2529 * @val: the expected value of uaddr 2530 * @abs_time: absolute timeout 2531 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2532 * @uaddr2: the pi futex we will take prior to returning to user-space 2533 * 2534 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2535 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake 2536 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to 2537 * userspace. This ensures the rt_mutex maintains an owner when it has waiters; 2538 * without one, the pi logic would not know which task to boost/deboost, if 2539 * there was a need to. 2540 * 2541 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2542 * via the following-- 2543 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2544 * 2) wakeup on uaddr2 after a requeue 2545 * 3) signal 2546 * 4) timeout 2547 * 2548 * If 3, cleanup and return -ERESTARTNOINTR. 2549 * 2550 * If 2, we may then block on trying to take the rt_mutex and return via: 2551 * 5) successful lock 2552 * 6) signal 2553 * 7) timeout 2554 * 8) other lock acquisition failure 2555 * 2556 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). 2557 * 2558 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 
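 *
 * For context, the userspace pairing this serves is roughly the PI-aware
 * condition variable (a hedged sketch; 'cond' and 'lock' are illustrative
 * userspace futex words, not kernel symbols):
 *
 *	waiter:	val = *cond;
 *		unlock the PI mutex at 'lock';
 *		syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI, val,
 *			timeout, lock, 0);
 *	waker:	(*cond)++;
 *		syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI, 1,
 *			nr_requeue, lock, *cond);
 *
 * so a woken waiter returns to userspace already owning, or properly
 * queued on, the PI futex at 'lock'.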
2559 * 2560 * Return: 2561 * 0 - On success; 2562 * <0 - On error 2563 */ 2564static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2565 u32 val, ktime_t *abs_time, u32 bitset, 2566 u32 __user *uaddr2) 2567{ 2568 struct hrtimer_sleeper timeout, *to = NULL; 2569 struct rt_mutex_waiter rt_waiter; 2570 struct rt_mutex *pi_mutex = NULL; 2571 struct futex_hash_bucket *hb; 2572 union futex_key key2 = FUTEX_KEY_INIT; 2573 struct futex_q q = futex_q_init; 2574 int res, ret; 2575 2576 if (uaddr == uaddr2) 2577 return -EINVAL; 2578 2579 if (!bitset) 2580 return -EINVAL; 2581 2582 if (abs_time) { 2583 to = &timeout; 2584 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 2585 CLOCK_REALTIME : CLOCK_MONOTONIC, 2586 HRTIMER_MODE_ABS); 2587 hrtimer_init_sleeper(to, current); 2588 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2589 current->timer_slack_ns); 2590 } 2591 2592 /* 2593 * The waiter is allocated on our stack, manipulated by the requeue 2594 * code while we sleep on uaddr. 2595 */ 2596 debug_rt_mutex_init_waiter(&rt_waiter); 2597 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); 2598 RB_CLEAR_NODE(&rt_waiter.tree_entry); 2599 rt_waiter.task = NULL; 2600 2601 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 2602 if (unlikely(ret != 0)) 2603 goto out; 2604 2605 q.bitset = bitset; 2606 q.rt_waiter = &rt_waiter; 2607 q.requeue_pi_key = &key2; 2608 2609 /* 2610 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2611 * count. 2612 */ 2613 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 2614 if (ret) 2615 goto out_key2; 2616 2617 /* 2618 * The check above which compares uaddrs is not sufficient for 2619 * shared futexes. We need to compare the keys: 2620 */ 2621 if (match_futex(&q.key, &key2)) { 2622 queue_unlock(hb); 2623 ret = -EINVAL; 2624 goto out_put_keys; 2625 } 2626 2627 /* Queue the futex_q, drop the hb lock, wait for wakeup. */ 2628 futex_wait_queue_me(hb, &q, to); 2629 2630 spin_lock(&hb->lock); 2631 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); 2632 spin_unlock(&hb->lock); 2633 if (ret) 2634 goto out_put_keys; 2635 2636 /* 2637 * In order for us to be here, we know our q.key == key2, and since 2638 * we took the hb->lock above, we also know that futex_requeue() has 2639 * completed and we no longer have to concern ourselves with a wakeup 2640 * race with the atomic proxy lock acquisition by the requeue code. The 2641 * futex_requeue dropped our key1 reference and incremented our key2 2642 * reference count. 2643 */ 2644 2645 /* Check if the requeue code acquired the second futex for us. */ 2646 if (!q.rt_waiter) { 2647 /* 2648 * Got the lock. We might not be the anticipated owner if we 2649 * did a lock-steal - fix up the PI-state in that case. 2650 */ 2651 if (q.pi_state && (q.pi_state->owner != current)) { 2652 spin_lock(q.lock_ptr); 2653 ret = fixup_pi_state_owner(uaddr2, &q, current); 2654 /* 2655 * Drop the reference to the pi state which 2656 * the requeue_pi() code acquired for us. 2657 */ 2658 free_pi_state(q.pi_state); 2659 spin_unlock(q.lock_ptr); 2660 } 2661 } else { 2662 /* 2663 * We have been woken up by futex_unlock_pi(), a timeout, or a 2664 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 2665 * the pi_state. 
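 *
 * In sketch form, the hand-off completed below is roughly (a hedged
 * summary of the code that follows, not an additional path):
 *
 *	waker:	futex_unlock_pi(uaddr2), via wake_futex_pi()
 *		  -> rt_mutex_unlock(&pi_state->pi_mutex)	wakes us
 *	here:	rt_mutex_finish_proxy_lock()		take the lock or give up
 *		  -> fixup_owner()			sync uaddr2 and pi_state
 *		  -> unqueue_me_pi()			drop the hash bucket lock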
2666 */ 2667 WARN_ON(!q.pi_state); 2668 pi_mutex = &q.pi_state->pi_mutex; 2669 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); 2670 debug_rt_mutex_free_waiter(&rt_waiter); 2671 2672 spin_lock(q.lock_ptr); 2673 /* 2674 * Fixup the pi_state owner and possibly acquire the lock if we 2675 * haven't already. 2676 */ 2677 res = fixup_owner(uaddr2, &q, !ret); 2678 /* 2679 * If fixup_owner() returned an error, proprogate that. If it 2680 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2681 */ 2682 if (res) 2683 ret = (res < 0) ? res : 0; 2684 2685 /* Unqueue and drop the lock. */ 2686 unqueue_me_pi(&q); 2687 } 2688 2689 /* 2690 * If fixup_pi_state_owner() faulted and was unable to handle the 2691 * fault, unlock the rt_mutex and return the fault to userspace. 2692 */ 2693 if (ret == -EFAULT) { 2694 if (pi_mutex && rt_mutex_owner(pi_mutex) == current) 2695 rt_mutex_unlock(pi_mutex); 2696 } else if (ret == -EINTR) { 2697 /* 2698 * We've already been requeued, but cannot restart by calling 2699 * futex_lock_pi() directly. We could restart this syscall, but 2700 * it would detect that the user space "val" changed and return 2701 * -EWOULDBLOCK. Save the overhead of the restart and return 2702 * -EWOULDBLOCK directly. 2703 */ 2704 ret = -EWOULDBLOCK; 2705 } 2706 2707out_put_keys: 2708 put_futex_key(&q.key); 2709out_key2: 2710 put_futex_key(&key2); 2711 2712out: 2713 if (to) { 2714 hrtimer_cancel(&to->timer); 2715 destroy_hrtimer_on_stack(&to->timer); 2716 } 2717 return ret; 2718} 2719 2720/* 2721 * Support for robust futexes: the kernel cleans up held futexes at 2722 * thread exit time. 2723 * 2724 * Implementation: user-space maintains a per-thread list of locks it 2725 * is holding. Upon do_exit(), the kernel carefully walks this list, 2726 * and marks all locks that are owned by this thread with the 2727 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is 2728 * always manipulated with the lock held, so the list is private and 2729 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 2730 * field, to allow the kernel to clean up if the thread dies after 2731 * acquiring the lock, but just before it could have added itself to 2732 * the list. There can only be one such pending lock. 
2733 */ 2734 2735/** 2736 * sys_set_robust_list() - Set the robust-futex list head of a task 2737 * @head: pointer to the list-head 2738 * @len: length of the list-head, as userspace expects 2739 */ 2740SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 2741 size_t, len) 2742{ 2743 if (!futex_cmpxchg_enabled) 2744 return -ENOSYS; 2745 /* 2746 * The kernel knows only one size for now: 2747 */ 2748 if (unlikely(len != sizeof(*head))) 2749 return -EINVAL; 2750 2751 current->robust_list = head; 2752 2753 return 0; 2754} 2755 2756/** 2757 * sys_get_robust_list() - Get the robust-futex list head of a task 2758 * @pid: pid of the process [zero for current task] 2759 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 2760 * @len_ptr: pointer to a length field, the kernel fills in the header size 2761 */ 2762SYSCALL_DEFINE3(get_robust_list, int, pid, 2763 struct robust_list_head __user * __user *, head_ptr, 2764 size_t __user *, len_ptr) 2765{ 2766 struct robust_list_head __user *head; 2767 unsigned long ret; 2768 struct task_struct *p; 2769 2770 if (!futex_cmpxchg_enabled) 2771 return -ENOSYS; 2772 2773 rcu_read_lock(); 2774 2775 ret = -ESRCH; 2776 if (!pid) 2777 p = current; 2778 else { 2779 p = find_task_by_vpid(pid); 2780 if (!p) 2781 goto err_unlock; 2782 } 2783 2784 ret = -EPERM; 2785 if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) 2786 goto err_unlock; 2787 2788 head = p->robust_list; 2789 rcu_read_unlock(); 2790 2791 if (put_user(sizeof(*head), len_ptr)) 2792 return -EFAULT; 2793 return put_user(head, head_ptr); 2794 2795err_unlock: 2796 rcu_read_unlock(); 2797 2798 return ret; 2799} 2800 2801/* 2802 * Process a futex-list entry, check whether it's owned by the 2803 * dying task, and do notification if so: 2804 */ 2805int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 2806{ 2807 u32 uval, uninitialized_var(nval), mval; 2808 2809retry: 2810 if (get_user(uval, uaddr)) 2811 return -1; 2812 2813 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { 2814 /* 2815 * Ok, this dying thread is truly holding a futex 2816 * of interest. Set the OWNER_DIED bit atomically 2817 * via cmpxchg, and if the value had FUTEX_WAITERS 2818 * set, wake up a waiter (if any). (We have to do a 2819 * futex_wake() even if OWNER_DIED is already set - 2820 * to handle the rare but possible case of recursive 2821 * thread-death.) The rest of the cleanup is done in 2822 * userspace. 2823 */ 2824 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2825 /* 2826 * We are not holding a lock here, but we want to have 2827 * the pagefault_disable/enable() protection because 2828 * we want to handle the fault gracefully. If the 2829 * access fails we try to fault in the futex with R/W 2830 * verification via get_user_pages. get_user() above 2831 * does not guarantee R/W access. If that fails we 2832 * give up and leave the futex locked. 2833 */ 2834 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { 2835 if (fault_in_user_writeable(uaddr)) 2836 return -1; 2837 goto retry; 2838 } 2839 if (nval != uval) 2840 goto retry; 2841 2842 /* 2843 * Wake robust non-PI futexes here. The wakeup of 2844 * PI futexes happens in exit_pi_state(): 2845 */ 2846 if (!pi && (uval & FUTEX_WAITERS)) 2847 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); 2848 } 2849 return 0; 2850} 2851 2852/* 2853 * Fetch a robust-list pointer. 
Bit 0 signals PI futexes: 2854 */ 2855static inline int fetch_robust_entry(struct robust_list __user **entry, 2856 struct robust_list __user * __user *head, 2857 unsigned int *pi) 2858{ 2859 unsigned long uentry; 2860 2861 if (get_user(uentry, (unsigned long __user *)head)) 2862 return -EFAULT; 2863 2864 *entry = (void __user *)(uentry & ~1UL); 2865 *pi = uentry & 1; 2866 2867 return 0; 2868} 2869 2870/* 2871 * Walk curr->robust_list (very carefully, it's a userspace list!) 2872 * and mark any locks found there dead, and notify any waiters. 2873 * 2874 * We silently return on any sign of list-walking problem. 2875 */ 2876void exit_robust_list(struct task_struct *curr) 2877{ 2878 struct robust_list_head __user *head = curr->robust_list; 2879 struct robust_list __user *entry, *next_entry, *pending; 2880 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; 2881 unsigned int uninitialized_var(next_pi); 2882 unsigned long futex_offset; 2883 int rc; 2884 2885 if (!futex_cmpxchg_enabled) 2886 return; 2887 2888 /* 2889 * Fetch the list head (which was registered earlier, via 2890 * sys_set_robust_list()): 2891 */ 2892 if (fetch_robust_entry(&entry, &head->list.next, &pi)) 2893 return; 2894 /* 2895 * Fetch the relative futex offset: 2896 */ 2897 if (get_user(futex_offset, &head->futex_offset)) 2898 return; 2899 /* 2900 * Fetch any possibly pending lock-add first, and handle it 2901 * if it exists: 2902 */ 2903 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) 2904 return; 2905 2906 next_entry = NULL; /* avoid warning with gcc */ 2907 while (entry != &head->list) { 2908 /* 2909 * Fetch the next entry in the list before calling 2910 * handle_futex_death: 2911 */ 2912 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); 2913 /* 2914 * A pending lock might already be on the list, so 2915 * don't process it twice: 2916 */ 2917 if (entry != pending) 2918 if (handle_futex_death((void __user *)entry + futex_offset, 2919 curr, pi)) 2920 return; 2921 if (rc) 2922 return; 2923 entry = next_entry; 2924 pi = next_pi; 2925 /* 2926 * Avoid excessively long or circular lists: 2927 */ 2928 if (!--limit) 2929 break; 2930 2931 cond_resched(); 2932 } 2933 2934 if (pending) 2935 handle_futex_death((void __user *)pending + futex_offset, 2936 curr, pip); 2937} 2938 2939long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2940 u32 __user *uaddr2, u32 val2, u32 val3) 2941{ 2942 int cmd = op & FUTEX_CMD_MASK; 2943 unsigned int flags = 0; 2944 2945 if (!(op & FUTEX_PRIVATE_FLAG)) 2946 flags |= FLAGS_SHARED; 2947 2948 if (op & FUTEX_CLOCK_REALTIME) { 2949 flags |= FLAGS_CLOCKRT; 2950 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2951 return -ENOSYS; 2952 } 2953 2954 switch (cmd) { 2955 case FUTEX_LOCK_PI: 2956 case FUTEX_UNLOCK_PI: 2957 case FUTEX_TRYLOCK_PI: 2958 case FUTEX_WAIT_REQUEUE_PI: 2959 case FUTEX_CMP_REQUEUE_PI: 2960 if (!futex_cmpxchg_enabled) 2961 return -ENOSYS; 2962 } 2963 2964 switch (cmd) { 2965 case FUTEX_WAIT: 2966 val3 = FUTEX_BITSET_MATCH_ANY; 2967 case FUTEX_WAIT_BITSET: 2968 return futex_wait(uaddr, flags, val, timeout, val3); 2969 case FUTEX_WAKE: 2970 val3 = FUTEX_BITSET_MATCH_ANY; 2971 case FUTEX_WAKE_BITSET: 2972 return futex_wake(uaddr, flags, val, val3); 2973 case FUTEX_REQUEUE: 2974 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 2975 case FUTEX_CMP_REQUEUE: 2976 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 2977 case FUTEX_WAKE_OP: 2978 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2979 case 
FUTEX_LOCK_PI: 2980 return futex_lock_pi(uaddr, flags, timeout, 0); 2981 case FUTEX_UNLOCK_PI: 2982 return futex_unlock_pi(uaddr, flags); 2983 case FUTEX_TRYLOCK_PI: 2984 return futex_lock_pi(uaddr, flags, NULL, 1); 2985 case FUTEX_WAIT_REQUEUE_PI: 2986 val3 = FUTEX_BITSET_MATCH_ANY; 2987 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2988 uaddr2); 2989 case FUTEX_CMP_REQUEUE_PI: 2990 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 2991 } 2992 return -ENOSYS; 2993} 2994 2995 2996SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, 2997 struct timespec __user *, utime, u32 __user *, uaddr2, 2998 u32, val3) 2999{ 3000 struct timespec ts; 3001 ktime_t t, *tp = NULL; 3002 u32 val2 = 0; 3003 int cmd = op & FUTEX_CMD_MASK; 3004 3005 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 3006 cmd == FUTEX_WAIT_BITSET || 3007 cmd == FUTEX_WAIT_REQUEUE_PI)) { 3008 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 3009 return -EFAULT; 3010 if (!timespec_valid(&ts)) 3011 return -EINVAL; 3012 3013 t = timespec_to_ktime(ts); 3014 if (cmd == FUTEX_WAIT) 3015 t = ktime_add_safe(ktime_get(), t); 3016 tp = &t; 3017 } 3018 /* 3019 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. 3020 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 3021 */ 3022 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 3023 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) 3024 val2 = (u32) (unsigned long) utime; 3025 3026 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 3027} 3028 3029static void __init futex_detect_cmpxchg(void) 3030{ 3031#ifndef CONFIG_HAVE_FUTEX_CMPXCHG 3032 u32 curval; 3033 3034 /* 3035 * This will fail and we want it. Some arch implementations do 3036 * runtime detection of the futex_atomic_cmpxchg_inatomic() 3037 * functionality. We want to know that before we call in any 3038 * of the complex code paths. Also we want to prevent 3039 * registration of robust lists in that case. NULL is 3040 * guaranteed to fault and we get -EFAULT on functional 3041 * implementation, the non-functional ones will return 3042 * -ENOSYS. 3043 */ 3044 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) 3045 futex_cmpxchg_enabled = 1; 3046#endif 3047} 3048 3049static int __init futex_init(void) 3050{ 3051 unsigned int futex_shift; 3052 unsigned long i; 3053 3054#if CONFIG_BASE_SMALL 3055 futex_hashsize = 16; 3056#else 3057 futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); 3058#endif 3059 3060 futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), 3061 futex_hashsize, 0, 3062 futex_hashsize < 256 ? HASH_SMALL : 0, 3063 &futex_shift, NULL, 3064 futex_hashsize, futex_hashsize); 3065 futex_hashsize = 1UL << futex_shift; 3066 3067 futex_detect_cmpxchg(); 3068 3069 for (i = 0; i < futex_hashsize; i++) { 3070 atomic_set(&futex_queues[i].waiters, 0); 3071 plist_head_init(&futex_queues[i].chain); 3072 spin_lock_init(&futex_queues[i].lock); 3073 } 3074 3075 return 0; 3076} 3077__initcall(futex_init); 3078
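/*
 * Illustrative userspace sketch (not kernel code): the minimal wait/wake
 * protocol that the ordering guarantees documented at the top of this
 * file make correct. The raw syscall wrapper and the 'flag' variable are
 * assumptions for the example only.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long futex(int *uaddr, int op, int val)
 *	{
 *		return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
 *	}
 *
 *	int flag;	// 0 = not ready, 1 = ready
 *
 *	void wait_for_flag(void)
 *	{
 *		while (__atomic_load_n(&flag, __ATOMIC_ACQUIRE) == 0)
 *			futex(&flag, FUTEX_WAIT_PRIVATE, 0);	// sleep only while still 0
 *	}
 *
 *	void set_flag(void)
 *	{
 *		__atomic_store_n(&flag, 1, __ATOMIC_RELEASE);
 *		futex(&flag, FUTEX_WAKE_PRIVATE, 1);		// wake one waiter
 *	}
 */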