/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
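
/*
 * Informational sketch (not part of this file's logic): the userspace
 * half of the protocol this file implements. A simple lock built on a
 * futex word might look roughly like this; all names below are
 * illustrative only:
 *
 *	while (atomic_cmpxchg(&futex_word, 0, 1) != 0)
 *		syscall(SYS_futex, &futex_word, FUTEX_WAIT, 1, NULL, NULL, 0);
 *	...critical section...
 *	atomic_set(&futex_word, 0);
 *	syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
 *
 * A production mutex avoids the FUTEX_WAKE syscall when it knows there
 * are no waiters (e.g. by using a three-state word), so the uncontended
 * fast path is pure userspace atomics - which is what makes futexes fast.
 */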
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/module.h>
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>

#include <asm/futex.h>

#include "rtmutex_common.h"

int __read_mostly futex_cmpxchg_enabled;

#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)

/*
 * Priority Inheritance state:
 */
struct futex_pi_state {
	/*
	 * list of 'owned' pi_state instances - these have to be
	 * cleaned up in do_exit() if the task exits prematurely:
	 */
	struct list_head list;

	/*
	 * The PI object:
	 */
	struct rt_mutex pi_mutex;

	struct task_struct *owner;
	atomic_t refcount;

	union futex_key key;
};

/**
 * struct futex_q - The hashed futex queue entry, one per waiting task
 * @task:		the task waiting on the futex
 * @lock_ptr:		the hash bucket lock
 * @key:		the key the futex is hashed on
 * @pi_state:		optional priority inheritance state
 * @rt_waiter:		rt_waiter storage for use with requeue_pi
 * @requeue_pi_key:	the requeue_pi target futex key
 * @bitset:		bitset for the optional bitmasked wakeup
 *
 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * the second.
 *
 * PI futexes are typically woken before they are removed from the hash list via
 * the rt_mutex code. See unqueue_me_pi().
 */
struct futex_q {
	struct plist_node list;

	struct task_struct *task;
	spinlock_t *lock_ptr;
	union futex_key key;
	struct futex_pi_state *pi_state;
	struct rt_mutex_waiter *rt_waiter;
	union futex_key *requeue_pi_key;
	u32 bitset;
};
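
/*
 * Illustrative sketch (informational): how a waiter observes the woken
 * state described above. This mirrors the lock_ptr handshake used by
 * unqueue_me() later in this file; the details here are only an
 * approximation for the reader:
 *
 *	spinlock_t *lock_ptr;
 * retry:
 *	lock_ptr = q->lock_ptr;
 *	barrier();
 *	if (lock_ptr != NULL) {
 *		spin_lock(lock_ptr);
 *		if (lock_ptr != q->lock_ptr) {
 *			spin_unlock(lock_ptr);
 *			goto retry;	// q was requeued to another bucket
 *		}
 *		// still queued: dequeue under the bucket lock
 *	}
 *	// lock_ptr == NULL means wake_futex() already removed us
 */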

/*
 * Hash buckets are shared by all the futex_keys that hash to the same
 * location.  Each key may have multiple futex_q structures, one for each task
 * waiting on a futex.
 */
struct futex_hash_bucket {
	spinlock_t lock;
	struct plist_head chain;
};

static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];

/*
 * We hash on the keys returned from get_futex_key (see below).
 */
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
	u32 hash = jhash2((u32*)&key->both.word,
			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
			  key->both.offset);
	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
}

/*
 * Return 1 if two futex_keys are equal, 0 otherwise.
 */
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
	return (key1 && key2
		&& key1->both.word == key2->both.word
		&& key1->both.ptr == key2->both.ptr
		&& key1->both.offset == key2->both.offset);
}

/*
 * Take a reference to the resource addressed by a key.
 * Can be called while holding spinlocks.
 *
 */
static void get_futex_key_refs(union futex_key *key)
{
	if (!key->both.ptr)
		return;

	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
	case FUT_OFF_INODE:
		atomic_inc(&key->shared.inode->i_count);
		break;
	case FUT_OFF_MMSHARED:
		atomic_inc(&key->private.mm->mm_count);
		break;
	}
}

/*
 * Drop a reference to the resource addressed by a key.
 * The hash bucket spinlock must not be held.
 */
static void drop_futex_key_refs(union futex_key *key)
{
	if (!key->both.ptr) {
		/* If we're here then we tried to put a key we failed to get */
		WARN_ON_ONCE(1);
		return;
	}

	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
	case FUT_OFF_INODE:
		iput(key->shared.inode);
		break;
	case FUT_OFF_MMSHARED:
		mmdrop(key->private.mm);
		break;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: VERIFY_READ,
 * 		VERIFY_WRITE)
 *
 * Returns a negative error code or 0.
 * The key words are stored in *key on success.
 *
 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 * We can usually work out the index without swapping in the page.
 *
 * lock_page() might sleep, so the caller should not hold a spinlock.
 */
static int
get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page;
	int err;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * the virtual address, we don't even have to find the underlying vma.
	 * Note: we do have to check that 'uaddr' is a valid user address,
	 *       but access_ok() should be faster than find_vma().
	 */
	if (!fshared) {
		if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
			return -EFAULT;
		key->private.mm = mm;
		key->private.address = address;
		get_futex_key_refs(key);
		return 0;
	}

again:
	err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
	if (err < 0)
		return err;

	page = compound_head(page);
	lock_page(page);
	if (!page->mapping) {
		unlock_page(page);
		put_page(page);
		goto again;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (PageAnon(page)) {
		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;
	} else {
		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.inode = page->mapping->host;
		key->shared.pgoff = page->index;
	}

	get_futex_key_refs(key);

	unlock_page(page);
	put_page(page);
	return 0;
}
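
/*
 * Summary of the three key forms produced above (informational):
 *   - private futex:            (current->mm, address), no flag bits set
 *   - shared, anonymous page:   (current->mm, address) + FUT_OFF_MMSHARED
 *   - shared, file-backed page: (inode, page->index)   + FUT_OFF_INODE
 * The flag bits live in the low bits of both.offset, which works because
 * the futex itself must be u32-aligned.
 */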

static inline
void put_futex_key(int fshared, union futex_key *key)
{
	drop_futex_key_refs(key);
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the pagefault-disabled
 * atomic section, so we can just as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
static int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages(current, mm, (unsigned long)uaddr,
			     1, 1, 0, NULL, NULL);
	up_read(&mm->mmap_sem);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futexes' futex_q's)
 *
 * Must be called with the hb lock held.
 */
static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
					union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (match_futex(&this->key, key))
			return this;
	}
	return NULL;
}

static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
{
	u32 curval;

	pagefault_disable();
	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
	pagefault_enable();

	return curval;
}

static int get_futex_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}
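
/*
 * Note (informational): the two helpers above show the access idiom used
 * throughout this file. User memory is touched with pagefaults disabled
 * while hb->lock is held; when that fails with -EFAULT, callers drop the
 * hash bucket lock, fault the page in via get_user() or
 * fault_in_user_writeable(), and retry the whole operation.
 */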

/*
 * PI code:
 */
static int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	atomic_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state * alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void free_pi_state(struct futex_pi_state *pi_state)
{
	if (!atomic_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		spin_lock_irq(&pi_state->owner->pi_lock);
		list_del_init(&pi_state->list);
		spin_unlock_irq(&pi_state->owner->pi_lock);

		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
	}

	if (current->pi_state_cache)
		kfree(pi_state);
	else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		atomic_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * Look up the task based on what TID userspace gave us.
 * We don't trust it.
 */
static struct task_struct * futex_find_get_task(pid_t pid)
{
	struct task_struct *p;
	const struct cred *cred = current_cred(), *pcred;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (!p) {
		p = ERR_PTR(-ESRCH);
	} else {
		pcred = __task_cred(p);
		if (cred->euid != pcred->euid &&
		    cred->euid != pcred->uid)
			p = ERR_PTR(-ESRCH);
		else
			get_task_struct(p);
	}

	rcu_read_unlock();

	return p;
}

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	if (!futex_cmpxchg_enabled)
		return;
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {

		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = hash_futex(&key);
		spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);

		spin_lock_irq(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			spin_unlock(&hb->lock);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;
		spin_unlock_irq(&curr->pi_lock);

		rt_mutex_unlock(&pi_state->pi_mutex);

		spin_unlock(&hb->lock);

		spin_lock_irq(&curr->pi_lock);
	}
	spin_unlock_irq(&curr->pi_lock);
}

static int
lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
		union futex_key *key, struct futex_pi_state **ps)
{
	struct futex_pi_state *pi_state = NULL;
	struct futex_q *this, *next;
	struct plist_head *head;
	struct task_struct *p;
	pid_t pid = uval & FUTEX_TID_MASK;

	head = &hb->chain;

	plist_for_each_entry_safe(this, next, head, list) {
		if (match_futex(&this->key, key)) {
			/*
			 * Another waiter already exists - bump up
			 * the refcount and return its pi_state:
			 */
			pi_state = this->pi_state;
			/*
			 * Userspace might have messed up non PI and PI futexes
			 */
			if (unlikely(!pi_state))
				return -EINVAL;

			WARN_ON(!atomic_read(&pi_state->refcount));
			WARN_ON(pid && pi_state->owner &&
				pi_state->owner->pid != pid);

			atomic_inc(&pi_state->refcount);
			*ps = pi_state;

			return 0;
		}
	}

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0
	 */
	if (!pid)
		return -ESRCH;
	p = futex_find_get_task(pid);
	if (IS_ERR(p))
		return PTR_ERR(p);

	/*
	 * We need to look at the task state flags to figure out
	 * whether the task is exiting. To protect against the do_exit
	 * change of the task flags, we do this protected by
	 * p->pi_lock:
	 */
	spin_lock_irq(&p->pi_lock);
	if (unlikely(p->flags & PF_EXITING)) {
		/*
		 * The task is on the way out. When PF_EXITPIDONE is
		 * set, we know that the task has finished the
		 * cleanup:
		 */
		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;

		spin_unlock_irq(&p->pi_lock);
		put_task_struct(p);
		return ret;
	}

	pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make 'p'
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	pi_state->owner = p;
	spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	*ps = pi_state;

	return 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Returns:
 *  0 - ready to wait
 *  1 - acquired the lock
 * <0 - error
 *
 * The hb->lock and futex_key refs shall be held by the caller.
 */
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
				union futex_key *key,
				struct futex_pi_state **ps,
				struct task_struct *task, int set_waiters)
{
	int lock_taken, ret, ownerdied = 0;
	u32 uval, newval, curval;

retry:
	ret = lock_taken = 0;

	/*
	 * To avoid races, we attempt to take the lock here again
	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
	 * the locks. It will most likely not succeed.
	 */
	newval = task_pid_vnr(task);
	if (set_waiters)
		newval |= FUTEX_WAITERS;

	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);

	if (unlikely(curval == -EFAULT))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
		return -EDEADLK;

	/*
	 * Surprise - we got the lock. Just return to userspace:
	 */
	if (unlikely(!curval))
		return 1;

	uval = curval;

	/*
	 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
	 * to wake at the next unlock.
	 */
	newval = curval | FUTEX_WAITERS;

	/*
	 * There are two cases in which we take over the futex: the owner
	 * died (ownerdied was set via the robust-futex handling below),
	 * or the futex has no owner at all (the owner TID is 0).
	 *
	 * This is safe as we are protected by the hash bucket lock!
	 */
	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
		/* Keep the OWNER_DIED bit */
		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
		ownerdied = 0;
		lock_taken = 1;
	}

	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);

	if (unlikely(curval == -EFAULT))
		return -EFAULT;
	if (unlikely(curval != uval))
		goto retry;

	/*
	 * We took the lock due to an owner-died take over.
	 */
	if (unlikely(lock_taken))
		return 1;

	/*
	 * We don't have the lock. Look up the PI state (or create it if
	 * we are the first waiter):
	 */
	ret = lookup_pi_state(uval, hb, key, ps);

	if (unlikely(ret)) {
		switch (ret) {
		case -ESRCH:
			/*
			 * No owner found for this futex. Check if the
			 * OWNER_DIED bit is set to figure out whether
			 * this is a robust futex or not.
			 */
			if (get_futex_value_locked(&curval, uaddr))
				return -EFAULT;

			/*
			 * We simply start over in case of a robust
			 * futex. The code above will take the futex
			 * and return happy.
			 */
			if (curval & FUTEX_OWNER_DIED) {
				ownerdied = 1;
				goto retry;
			}
		default:
			break;
		}
	}

	return ret;
}
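
/*
 * Informational: a PI futex word holds the owner's TID plus the
 * FUTEX_WAITERS and FUTEX_OWNER_DIED flag bits, so the acquisition
 * protocol above boils down to cmpxchg operations on that single u32.
 */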

/*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed.
 */
static void wake_futex(struct futex_q *q)
{
	struct task_struct *p = q->task;

	/*
	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
	 * a non-futex wakeup happens on another CPU then the task
	 * might exit and p would dereference a non-existing task
	 * struct. Prevent this by holding a reference on p across the
	 * wake up.
	 */
	get_task_struct(p);

	plist_del(&q->list, &q->list.plist);
	/*
	 * The waiting task can free the futex_q as soon as
	 * q->lock_ptr = NULL is written, without taking any locks. A
	 * memory barrier is required here to prevent the following
	 * store to lock_ptr from getting ahead of the plist_del.
	 */
	smp_wmb();
	q->lock_ptr = NULL;

	wake_up_state(p, TASK_NORMAL);
	put_task_struct(p);
}

static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
{
	struct task_struct *new_owner;
	struct futex_pi_state *pi_state = this->pi_state;
	u32 curval, newval;

	if (!pi_state)
		return -EINVAL;

	spin_lock(&pi_state->pi_mutex.wait_lock);
	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);

	/*
	 * This happens when we have stolen the lock and the original
	 * pending owner did not enqueue itself back on the rt_mutex.
	 * That's not a tragedy. It tells us that a lock waiter is on
	 * the fly. We make the futex_q waiter the pending owner.
	 */
	if (!new_owner)
		new_owner = this->task;

	/*
	 * We pass it to the next owner. (The WAITERS bit is always
	 * kept enabled while there is PI state around. We must also
	 * preserve the owner died bit.)
	 */
	if (!(uval & FUTEX_OWNER_DIED)) {
		int ret = 0;

		newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);

		if (curval == -EFAULT)
			ret = -EFAULT;
		else if (curval != uval)
			ret = -EINVAL;
		if (ret) {
			spin_unlock(&pi_state->pi_mutex.wait_lock);
			return ret;
		}
	}

	spin_lock_irq(&pi_state->owner->pi_lock);
	WARN_ON(list_empty(&pi_state->list));
	list_del_init(&pi_state->list);
	spin_unlock_irq(&pi_state->owner->pi_lock);

	spin_lock_irq(&new_owner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &new_owner->pi_state_list);
	pi_state->owner = new_owner;
	spin_unlock_irq(&new_owner->pi_lock);

	spin_unlock(&pi_state->pi_mutex.wait_lock);
	rt_mutex_unlock(&pi_state->pi_mutex);

	return 0;
}

static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
{
	u32 oldval;

	/*
	 * There is no waiter, so we unlock the futex. The owner-died
	 * bit need not be preserved here. We are the owner:
	 */
	oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);

	if (oldval == -EFAULT)
		return oldval;
	if (oldval != uval)
		return -EAGAIN;

	return 0;
}
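
/*
 * Informational sketch: the userspace half of the PI protocol. An
 * uncontended PI futex is unlocked entirely in userspace by a single
 * cmpxchg of the owner TID to 0; only when FUTEX_WAITERS is set must the
 * holder call FUTEX_UNLOCK_PI, which lands in wake_futex_pi() or
 * unlock_futex_pi() above.
 */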

/*
 * Express the locking dependencies for lockdep:
 */
static inline void
double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
	if (hb1 <= hb2) {
		spin_lock(&hb1->lock);
		if (hb1 < hb2)
			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
	} else { /* hb1 > hb2 */
		spin_lock(&hb2->lock);
		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
	}
}
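
/*
 * Informational: taking the two bucket locks in a fixed (address) order
 * avoids an ABBA deadlock when two tasks operate on the same pair of
 * buckets concurrently; spin_lock_nested() tells lockdep that the nested
 * acquisition of a second lock of the same class is intentional.
 */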

static inline void
double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
	spin_unlock(&hb1->lock);
	if (hb1 != hb2)
		spin_unlock(&hb2->lock);
}

/*
 *  Wake up waiters matching bitset queued on this futex (uaddr).
 */
static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
{
	struct futex_hash_bucket *hb;
	struct futex_q *this, *next;
	struct plist_head *head;
	union futex_key key = FUTEX_KEY_INIT;
	int ret;

	if (!bitset)
		return -EINVAL;

	ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
	if (unlikely(ret != 0))
		goto out;

	hb = hash_futex(&key);
	spin_lock(&hb->lock);
	head = &hb->chain;

	plist_for_each_entry_safe(this, next, head, list) {
		if (match_futex(&this->key, &key)) {
			if (this->pi_state || this->rt_waiter) {
				ret = -EINVAL;
				break;
			}

			/* Check if one of the bits is set in both bitsets */
			if (!(this->bitset & bitset))
				continue;

			wake_futex(this);
			if (++ret >= nr_wake)
				break;
		}
	}

	spin_unlock(&hb->lock);
	put_futex_key(fshared, &key);
out:
	return ret;
}

/*
 * Wake up all waiters hashed on the physical page that is mapped
 * to this virtual address:
 */
static int
futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
	      int nr_wake, int nr_wake2, int op)
{
	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb1, *hb2;
	struct plist_head *head;
	struct futex_q *this, *next;
	int ret, op_ret;

retry:
	ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
	if (unlikely(ret != 0))
		goto out;
	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
	if (unlikely(ret != 0))
		goto out_put_key1;

	hb1 = hash_futex(&key1);
	hb2 = hash_futex(&key2);

retry_private:
	double_lock_hb(hb1, hb2);
	op_ret = futex_atomic_op_inuser(op, uaddr2);
	if (unlikely(op_ret < 0)) {

		double_unlock_hb(hb1, hb2);

#ifndef CONFIG_MMU
		/*
		 * we don't get EFAULT from MMU faults if we don't have an MMU,
		 * but we might get them from range checking
		 */
		ret = op_ret;
		goto out_put_keys;
#endif

		if (unlikely(op_ret != -EFAULT)) {
			ret = op_ret;
			goto out_put_keys;
		}

		ret = fault_in_user_writeable(uaddr2);
		if (ret)
			goto out_put_keys;

		if (!fshared)
			goto retry_private;

		put_futex_key(fshared, &key2);
		put_futex_key(fshared, &key1);
		goto retry;
	}

	head = &hb1->chain;

	plist_for_each_entry_safe(this, next, head, list) {
		if (match_futex(&this->key, &key1)) {
			wake_futex(this);
			if (++ret >= nr_wake)
				break;
		}
	}

	if (op_ret > 0) {
		head = &hb2->chain;

		op_ret = 0;
		plist_for_each_entry_safe(this, next, head, list) {
			if (match_futex(&this->key, &key2)) {
				wake_futex(this);
				if (++op_ret >= nr_wake2)
					break;
			}
		}
		ret += op_ret;
	}

	double_unlock_hb(hb1, hb2);
out_put_keys:
	put_futex_key(fshared, &key2);
out_put_key1:
	put_futex_key(fshared, &key1);
out:
	return ret;
}

/**
 * requeue_futex() - Requeue a futex_q from one hb to another
 * @q:		the futex_q to requeue
 * @hb1:	the source hash_bucket
 * @hb2:	the target hash_bucket
 * @key2:	the new key for the requeued futex_q
 */
static inline
void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
		   struct futex_hash_bucket *hb2, union futex_key *key2)
{

	/*
	 * If key1 and key2 hash to the same bucket, no need to
	 * requeue.
	 */
	if (likely(&hb1->chain != &hb2->chain)) {
		plist_del(&q->list, &hb1->chain);
		plist_add(&q->list, &hb2->chain);
		q->lock_ptr = &hb2->lock;
#ifdef CONFIG_DEBUG_PI_LIST
		q->list.plist.lock = &hb2->lock;
#endif
	}
	get_futex_key_refs(key2);
	q->key = *key2;
}

/**
 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
 * @q:		the futex_q
 * @key:	the key of the requeue target futex
 * @hb:		the hash_bucket of the requeue target futex
 *
 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
 * target futex if it is uncontended or via a lock steal.  Set the futex_q key
 * to the requeue target futex so the waiter can detect the wakeup on the right
 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
 * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
 * to protect access to the pi_state to fixup the owner later.  Must be called
 * with both q->lock_ptr and hb->lock held.
 */
static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
			   struct futex_hash_bucket *hb)
{
	get_futex_key_refs(key);
	q->key = *key;

	WARN_ON(plist_node_empty(&q->list));
	plist_del(&q->list, &q->list.plist);

	WARN_ON(!q->rt_waiter);
	q->rt_waiter = NULL;

	q->lock_ptr = &hb->lock;
#ifdef CONFIG_DEBUG_PI_LIST
	q->list.plist.lock = &hb->lock;
#endif

	wake_up_state(q->task, TASK_NORMAL);
}

/**
 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
 * @pifutex:		the user address of the to futex
 * @hb1:		the from futex hash bucket, must be locked by the caller
 * @hb2:		the to futex hash bucket, must be locked by the caller
 * @key1:		the from futex key
 * @key2:		the to futex key
 * @ps:			address to store the pi_state pointer
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Try and get the lock on behalf of the top waiter if we can do it atomically.
 * Wake the top waiter if we succeed.  If the caller specified set_waiters,
 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
 * hb1 and hb2 must be held by the caller.
 *
 * Returns:
 *  0 - failed to acquire the lock atomically
 *  1 - acquired the lock
 * <0 - error
 */
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
				 struct futex_hash_bucket *hb1,
				 struct futex_hash_bucket *hb2,
				 union futex_key *key1, union futex_key *key2,
				 struct futex_pi_state **ps, int set_waiters)
{
	struct futex_q *top_waiter = NULL;
	u32 curval;
	int ret;

	if (get_futex_value_locked(&curval, pifutex))
		return -EFAULT;

	/*
	 * Find the top_waiter and determine if there are additional waiters.
	 * If the caller intends to requeue more than 1 waiter to pifutex,
	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
	 * as we have means to handle the possible fault.  If not, don't set
	 * the bit unnecessarily as it will force the subsequent unlock to enter
	 * the kernel.
	 */
	top_waiter = futex_top_waiter(hb1, key1);

	/* There are no waiters, nothing for us to do. */
	if (!top_waiter)
		return 0;

	/* Ensure we requeue to the expected futex. */
	if (!match_futex(top_waiter->requeue_pi_key, key2))
		return -EINVAL;

	/*
	 * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
	 * the contended case or if set_waiters is 1.  The pi_state is returned
	 * in ps in contended cases.
	 */
	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
				   set_waiters);
	if (ret == 1)
		requeue_pi_wake_futex(top_waiter, key2, hb2);

	return ret;
}

/**
 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
 * @uaddr1:	source futex user address
 * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 * @uaddr2:	target futex user address
 * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
 * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
 * @cmpval:	@uaddr1 expected value (or %NULL to skip the check)
 * @requeue_pi:	if we are attempting to requeue from a non-pi futex to a
 *		pi futex (pi to pi requeue is not supported)
 *
 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
 * uaddr2 atomically on behalf of the top waiter.
 *
 * Returns:
 * >=0 - on success, the number of tasks requeued or woken
 *  <0 - on error
 */
static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
			 int nr_wake, int nr_requeue, u32 *cmpval,
			 int requeue_pi)
{
	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
	int drop_count = 0, task_count = 0, ret;
	struct futex_pi_state *pi_state = NULL;
	struct futex_hash_bucket *hb1, *hb2;
	struct plist_head *head1;
	struct futex_q *this, *next;
	u32 curval2;

	if (requeue_pi) {
		/*
		 * requeue_pi requires a pi_state, try to allocate it now
		 * without any locks in case it fails.
		 */
		if (refill_pi_state_cache())
			return -ENOMEM;
		/*
		 * requeue_pi must wake as many tasks as it can, up to nr_wake
		 * + nr_requeue, since it acquires the rt_mutex prior to
		 * returning to userspace, so as to not leave the rt_mutex with
		 * waiters and no owner.  However, second and third wake-ups
		 * cannot be predicted as they involve race conditions with the
		 * first wake and a fault while looking up the pi_state.  Both
		 * pthread_cond_signal() and pthread_cond_broadcast() should
		 * use nr_wake=1.
		 */
		if (nr_wake != 1)
			return -EINVAL;
	}

retry:
	if (pi_state != NULL) {
		/*
		 * We will have to lookup the pi_state again, so free this one
		 * to keep the accounting correct.
		 */
		free_pi_state(pi_state);
		pi_state = NULL;
	}

	ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
	if (unlikely(ret != 0))
		goto out;
	ret = get_futex_key(uaddr2, fshared, &key2,
			    requeue_pi ? VERIFY_WRITE : VERIFY_READ);
	if (unlikely(ret != 0))
		goto out_put_key1;

	hb1 = hash_futex(&key1);
	hb2 = hash_futex(&key2);

retry_private:
	double_lock_hb(hb1, hb2);

	if (likely(cmpval != NULL)) {
		u32 curval;
		ret = get_futex_value_locked(&curval, uaddr1);

		if (unlikely(ret)) {
			double_unlock_hb(hb1, hb2);

			ret = get_user(curval, uaddr1);
			if (ret)
				goto out_put_keys;

			if (!fshared)
				goto retry_private;

			put_futex_key(fshared, &key2);
			put_futex_key(fshared, &key1);
			goto retry;
		}
		if (curval != *cmpval) {
			ret = -EAGAIN;
			goto out_unlock;
		}
	}

	if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
		/*
		 * Attempt to acquire uaddr2 and wake the top waiter. If we
		 * intend to requeue waiters, force setting the FUTEX_WAITERS
		 * bit.  We force this here where we are able to easily handle
		 * faults rather than in the requeue loop below.
		 */
		ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
						 &key2, &pi_state, nr_requeue);

		/*
		 * At this point the top_waiter has either taken uaddr2 or is
		 * waiting on it.  If the former, then the pi_state will not
		 * exist yet, look it up one more time to ensure we have a
		 * reference to it.
		 */
		if (ret == 1) {
			WARN_ON(pi_state);
			drop_count++;
			task_count++;
			ret = get_futex_value_locked(&curval2, uaddr2);
			if (!ret)
				ret = lookup_pi_state(curval2, hb2, &key2,
						      &pi_state);
		}

		switch (ret) {
		case 0:
			break;
		case -EFAULT:
			double_unlock_hb(hb1, hb2);
			put_futex_key(fshared, &key2);
			put_futex_key(fshared, &key1);
			ret = fault_in_user_writeable(uaddr2);
			if (!ret)
				goto retry;
			goto out;
		case -EAGAIN:
			/* The owner was exiting, try again. */
			double_unlock_hb(hb1, hb2);
			put_futex_key(fshared, &key2);
			put_futex_key(fshared, &key1);
			cond_resched();
			goto retry;
		default:
			goto out_unlock;
		}
	}

	head1 = &hb1->chain;
	plist_for_each_entry_safe(this, next, head1, list) {