1/*
2 *  linux/fs/namei.c
3 *
4 *  Copyright (C) 1991, 1992  Linus Torvalds
5 */
6
7/*
8 * Some corrections by tytso.
9 */
10
11/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
12 * lookup logic.
13 */
14/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
15 */
16
17#include <linux/init.h>
18#include <linux/export.h>
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/fs.h>
22#include <linux/namei.h>
23#include <linux/pagemap.h>
24#include <linux/fsnotify.h>
25#include <linux/personality.h>
26#include <linux/security.h>
27#include <linux/ima.h>
28#include <linux/syscalls.h>
29#include <linux/mount.h>
30#include <linux/audit.h>
31#include <linux/capability.h>
32#include <linux/file.h>
33#include <linux/fcntl.h>
34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h>
36#include <linux/posix_acl.h>
37#include <linux/hash.h>
38#include <asm/uaccess.h>
39
40#include "internal.h"
41#include "mount.h"
42
43/* [Feb-1997 T. Schoebel-Theuer]
44 * Fundamental changes in the pathname lookup mechanisms (namei)
45 * were necessary because of omirr.  The reason is that omirr needs
46 * to know the _real_ pathname, not the user-supplied one, in case
47 * of symlinks (and also when transname replacements occur).
48 *
49 * The new code replaces the old recursive symlink resolution with
50 * an iterative one (in case of non-nested symlink chains).  It does
51 * this with calls to <fs>_follow_link().
52 * As a side effect, dir_namei(), _namei() and follow_link() are now
53 * replaced with a single function lookup_dentry() that can handle all
54 * the special cases of the former code.
55 *
56 * With the new dcache, the pathname is stored at each inode, at least as
57 * long as the refcount of the inode is positive.  As a side effect, the
58 * size of the dcache depends on the inode cache and thus is dynamic.
59 *
60 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
61 * resolution to correspond with current state of the code.
62 *
63 * Note that the symlink resolution is not *completely* iterative.
64 * There is still a significant amount of tail- and mid- recursion in
65 * the algorithm.  Also, note that <fs>_readlink() is not used in
66 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
67 * may return different results than <fs>_follow_link().  Many virtual
68 * filesystems (including /proc) exhibit this behavior.
69 */
70
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */
87
88/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
89 * semantics.  See the comments in "open_namei" and "do_link" below.
90 *
91 * [10-Sep-98 Alan Modra] Another symlink change.
92 */
93
94/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
95 *	inside the path - always follow.
96 *	in the last component in creation/removal/renaming - never follow.
97 *	if LOOKUP_FOLLOW passed - follow.
98 *	if the pathname has trailing slashes - follow.
99 *	otherwise - don't follow.
100 * (applied in that order).
101 *
102 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
103 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
104 * During the 2.4 we need to fix the userland stuff depending on it -
105 * hopefully we will be able to get rid of that wart in 2.5. So far only
106 * XEmacs seems to be relying on it...
107 */
108/*
109 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
110 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
111 * any extra contention...
112 */
113
/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
121
/*
 * Bytes available for the pathname when it is stored in ->iname of a
 * struct filename that sits at the start of a PATH_MAX-byte buffer.
 */
#define EMBEDDED_NAME_MAX \
	(PATH_MAX - offsetof(struct filename, iname))
123
124struct filename *
125getname_flags(const char __user *filename, int flags, int *empty)
126{
127	struct filename *result;
128	char *kname;
129	int len;
130
131	result = audit_reusename(filename);
132	if (result)
133		return result;
134
135	result = __getname();
136	if (unlikely(!result))
137		return ERR_PTR(-ENOMEM);
138
139	/*
140	 * First, try to embed the struct filename inside the names_cache
141	 * allocation
142	 */
143	kname = (char *)result->iname;
144	result->name = kname;
145
146	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
147	if (unlikely(len < 0)) {
148		__putname(result);
149		return ERR_PTR(len);
150	}
151
152	/*
153	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
154	 * separate struct filename so we can dedicate the entire
155	 * names_cache allocation for the pathname, and re-do the copy from
156	 * userland.
157	 */
158	if (unlikely(len == EMBEDDED_NAME_MAX)) {
159		const size_t size = offsetof(struct filename, iname[1]);
160		kname = (char *)result;
161
162		/*
163		 * size is chosen that way we to guarantee that
164		 * result->iname[0] is within the same object and that
165		 * kname can't be equal to result->iname, no matter what.
166		 */
167		result = kzalloc(size, GFP_KERNEL);
168		if (unlikely(!result)) {
169			__putname(kname);
170			return ERR_PTR(-ENOMEM);
171		}
172		result->name = kname;
173		len = strncpy_from_user(kname, filename, PATH_MAX);
174		if (unlikely(len < 0)) {
175			__putname(kname);
176			kfree(result);
177			return ERR_PTR(len);
178		}
179		if (unlikely(len == PATH_MAX)) {
180			__putname(kname);
181			kfree(result);
182			return ERR_PTR(-ENAMETOOLONG);
183		}
184	}
185
186	result->refcnt = 1;
187	/* The empty path is special. */
188	if (unlikely(!len)) {
189		if (empty)
190			*empty = 1;
191		if (!(flags & LOOKUP_EMPTY)) {
192			putname(result);
193			return ERR_PTR(-ENOENT);
194		}
195	}
196
197	result->uptr = filename;
198	result->aname = NULL;
199	audit_getname(result);
200	return result;
201}
202
203struct filename *
204getname(const char __user * filename)
205{
206	return getname_flags(filename, 0, NULL);
207}
208
209struct filename *
210getname_kernel(const char * filename)
211{
212	struct filename *result;
213	int len = strlen(filename) + 1;
214
215	result = __getname();
216	if (unlikely(!result))
217		return ERR_PTR(-ENOMEM);
218
219	if (len <= EMBEDDED_NAME_MAX) {
220		result->name = (char *)result->iname;
221	} else if (len <= PATH_MAX) {
222		struct filename *tmp;
223
224		tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
225		if (unlikely(!tmp)) {
226			__putname(result);
227			return ERR_PTR(-ENOMEM);
228		}
229		tmp->name = (char *)result;
230		result = tmp;
231	} else {
232		__putname(result);
233		return ERR_PTR(-ENAMETOOLONG);
234	}
235	memcpy((char *)result->name, filename, len);
236	result->uptr = NULL;
237	result->aname = NULL;
238	result->refcnt = 1;
239	audit_getname(result);
240
241	return result;
242}
243
244void putname(struct filename *name)
245{
246	BUG_ON(name->refcnt <= 0);
247
248	if (--name->refcnt > 0)
249		return;
250
251	if (name->name != name->iname) {
252		__putname(name->name);
253		kfree(name);
254	} else
255		__putname(name);
256}
257
258static int check_acl(struct inode *inode, int mask)
259{
260#ifdef CONFIG_FS_POSIX_ACL
261	struct posix_acl *acl;
262
263	if (mask & MAY_NOT_BLOCK) {
264		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
265	        if (!acl)
266	                return -EAGAIN;
267		/* no ->get_acl() calls in RCU mode... */
268		if (acl == ACL_NOT_CACHED)
269			return -ECHILD;
270	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
271	}
272
273	acl = get_acl(inode, ACL_TYPE_ACCESS);
274	if (IS_ERR(acl))
275		return PTR_ERR(acl);
276	if (acl) {
277	        int error = posix_acl_permission(inode, acl, mask);
278	        posix_acl_release(acl);
279	        return error;
280	}
281#endif
282
283	return -EAGAIN;
284}
285
286/*
287 * This does the basic permission checking
288 */
289static int acl_permission_check(struct inode *inode, int mask)
290{
291	unsigned int mode = inode->i_mode;
292
293	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
294		mode >>= 6;
295	else {
296		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
297			int error = check_acl(inode, mask);
298			if (error != -EAGAIN)
299				return error;
300		}
301
302		if (in_group_p(inode->i_gid))
303			mode >>= 3;
304	}
305
306	/*
307	 * If the DACs are ok we don't need any capability check.
308	 */
309	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
310		return 0;
311	return -EACCES;
312}
313
314/**
315 * generic_permission -  check for access rights on a Posix-like filesystem
316 * @inode:	inode to check access rights for
317 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
318 *
319 * Used to check for read/write/execute permissions on a file.
320 * We use "fsuid" for this, letting us set arbitrary permissions
321 * for filesystem access without changing the "normal" uids which
322 * are used for other things.
323 *
324 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
325 * request cannot be satisfied (eg. requires blocking or too much complexity).
326 * It would then be called again in ref-walk mode.
327 */
328int generic_permission(struct inode *inode, int mask)
329{
330	int ret;
331
332	/*
333	 * Do the basic permission checks.
334	 */
335	ret = acl_permission_check(inode, mask);
336	if (ret != -EACCES)
337		return ret;
338
339	if (S_ISDIR(inode->i_mode)) {
340		/* DACs are overridable for directories */
341		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
342			return 0;
343		if (!(mask & MAY_WRITE))
344			if (capable_wrt_inode_uidgid(inode,
345						     CAP_DAC_READ_SEARCH))
346				return 0;
347		return -EACCES;
348	}
349	/*
350	 * Read/write DACs are always overridable.
351	 * Executable DACs are overridable when there is
352	 * at least one exec bit set.
353	 */
354	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
355		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
356			return 0;
357
358	/*
359	 * Searching includes executable on directories, else just read.
360	 */
361	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
362	if (mask == MAY_READ)
363		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
364			return 0;
365
366	return -EACCES;
367}
368EXPORT_SYMBOL(generic_permission);
369
370/*
371 * We _really_ want to just do "generic_permission()" without
372 * even looking at the inode->i_op values. So we keep a cache
373 * flag in inode->i_opflags, that says "this has not special
374 * permission function, use the fast case".
375 */
376static inline int do_inode_permission(struct inode *inode, int mask)
377{
378	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
379		if (likely(inode->i_op->permission))
380			return inode->i_op->permission(inode, mask);
381
382		/* This gets set once for the inode lifetime */
383		spin_lock(&inode->i_lock);
384		inode->i_opflags |= IOP_FASTPERM;
385		spin_unlock(&inode->i_lock);
386	}
387	return generic_permission(inode, mask);
388}
389
390/**
391 * __inode_permission - Check for access rights to a given inode
392 * @inode: Inode to check permission on
393 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
394 *
395 * Check for read/write/execute permissions on an inode.
396 *
397 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
398 *
399 * This does not check for a read-only file system.  You probably want
400 * inode_permission().
401 */
402int __inode_permission(struct inode *inode, int mask)
403{
404	int retval;
405
406	if (unlikely(mask & MAY_WRITE)) {
407		/*
408		 * Nobody gets write access to an immutable file.
409		 */
410		if (IS_IMMUTABLE(inode))
411			return -EACCES;
412	}
413
414	retval = do_inode_permission(inode, mask);
415	if (retval)
416		return retval;
417
418	retval = devcgroup_inode_permission(inode, mask);
419	if (retval)
420		return retval;
421
422	return security_inode_permission(inode, mask);
423}
424EXPORT_SYMBOL(__inode_permission);
425
426/**
427 * sb_permission - Check superblock-level permissions
428 * @sb: Superblock of inode to check permission on
429 * @inode: Inode to check permission on
430 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
431 *
432 * Separate out file-system wide checks from inode-specific permission checks.
433 */
434static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
435{
436	if (unlikely(mask & MAY_WRITE)) {
437		umode_t mode = inode->i_mode;
438
439		/* Nobody gets write access to a read-only fs. */
440		if ((sb->s_flags & MS_RDONLY) &&
441		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
442			return -EROFS;
443	}
444	return 0;
445}
446
447/**
448 * inode_permission - Check for access rights to a given inode
449 * @inode: Inode to check permission on
450 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
451 *
452 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
453 * this, letting us set arbitrary permissions for filesystem access without
454 * changing the "normal" UIDs which are used for other things.
455 *
456 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
457 */
458int inode_permission(struct inode *inode, int mask)
459{
460	int retval;
461
462	retval = sb_permission(inode->i_sb, inode, mask);
463	if (retval)
464		return retval;
465	return __inode_permission(inode, mask);
466}
467EXPORT_SYMBOL(inode_permission);
468
469/**
470 * path_get - get a reference to a path
471 * @path: path to get the reference to
472 *
473 * Given a path increment the reference count to the dentry and the vfsmount.
474 */
475void path_get(const struct path *path)
476{
477	mntget(path->mnt);
478	dget(path->dentry);
479}
480EXPORT_SYMBOL(path_get);
481
482/**
483 * path_put - put a reference to a path
484 * @path: path to put the reference to
485 *
486 * Given a path decrement the reference count to the dentry and the vfsmount.
487 */
488void path_put(const struct path *path)
489{
490	dput(path->dentry);
491	mntput(path->mnt);
492}
493EXPORT_SYMBOL(path_put);
494
495#define EMBEDDED_LEVELS 2
496struct nameidata {
497	struct path	path;
498	struct qstr	last;
499	struct path	root;
500	struct inode	*inode; /* path.dentry.d_inode */
501	unsigned int	flags;
502	unsigned	seq, m_seq;
503	int		last_type;
504	unsigned	depth;
505	int		total_link_count;
506	struct saved {
507		struct path link;
508		void *cookie;
509		const char *name;
510		struct inode *inode;
511		unsigned seq;
512	} *stack, internal[EMBEDDED_LEVELS];
513	struct filename	*name;
514	struct nameidata *saved;
515	unsigned	root_seq;
516	int		dfd;
517};
518
519static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
520{
521	struct nameidata *old = current->nameidata;
522	p->stack = p->internal;
523	p->dfd = dfd;
524	p->name = name;
525	p->total_link_count = old ? old->total_link_count : 0;
526	p->saved = old;
527	current->nameidata = p;
528}
529
530static void restore_nameidata(void)
531{
532	struct nameidata *now = current->nameidata, *old = now->saved;
533
534	current->nameidata = old;
535	if (old)
536		old->total_link_count = now->total_link_count;
537	if (now->stack != now->internal) {
538		kfree(now->stack);
539		now->stack = now->internal;
540	}
541}
542
543static int __nd_alloc_stack(struct nameidata *nd)
544{
545	struct saved *p;
546
547	if (nd->flags & LOOKUP_RCU) {
548		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
549				  GFP_ATOMIC);
550		if (unlikely(!p))
551			return -ECHILD;
552	} else {
553		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
554				  GFP_KERNEL);
555		if (unlikely(!p))
556			return -ENOMEM;
557	}
558	memcpy(p, nd->internal, sizeof(nd->internal));
559	nd->stack = p;
560	return 0;
561}
562
563/**
564 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
565 * @path: nameidate to verify
566 *
567 * Rename can sometimes move a file or directory outside of a bind
568 * mount, path_connected allows those cases to be detected.
569 */
570static bool path_connected(const struct path *path)
571{
572	struct vfsmount *mnt = path->mnt;
573
574	/* Only bind mounts can have disconnected paths */
575	if (mnt->mnt_root == mnt->mnt_sb->s_root)
576		return true;
577
578	return is_subdir(path->dentry, mnt->mnt_root);
579}
580
581static inline int nd_alloc_stack(struct nameidata *nd)
582{
583	if (likely(nd->depth != EMBEDDED_LEVELS))
584		return 0;
585	if (likely(nd->stack != nd->internal))
586		return 0;
587	return __nd_alloc_stack(nd);
588}
589
590static void drop_links(struct nameidata *nd)
591{
592	int i = nd->depth;
593	while (i--) {
594		struct saved *last = nd->stack + i;
595		struct inode *inode = last->inode;
596		if (last->cookie && inode->i_op->put_link) {
597			inode->i_op->put_link(inode, last->cookie);
598			last->cookie = NULL;
599		}
600	}
601}
602
603static void terminate_walk(struct nameidata *nd)
604{
605	drop_links(nd);
606	if (!(nd->flags & LOOKUP_RCU)) {
607		int i;
608		path_put(&nd->path);
609		for (i = 0; i < nd->depth; i++)
610			path_put(&nd->stack[i].link);
611		if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
612			path_put(&nd->root);
613			nd->root.mnt = NULL;
614		}
615	} else {
616		nd->flags &= ~LOOKUP_RCU;
617		if (!(nd->flags & LOOKUP_ROOT))
618			nd->root.mnt = NULL;
619		rcu_read_unlock();
620	}
621	nd->depth = 0;
622}
623
624/* path_put is needed afterwards regardless of success or failure */
625static bool legitimize_path(struct nameidata *nd,
626			    struct path *path, unsigned seq)
627{
628	int res = __legitimize_mnt(path->mnt, nd->m_seq);
629	if (unlikely(res)) {
630		if (res > 0)
631			path->mnt = NULL;
632		path->dentry = NULL;
633		return false;
634	}
635	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
636		path->dentry = NULL;
637		return false;
638	}
639	return !read_seqcount_retry(&path->dentry->d_seq, seq);
640}
641
642static bool legitimize_links(struct nameidata *nd)
643{
644	int i;
645	for (i = 0; i < nd->depth; i++) {
646		struct saved *last = nd->stack + i;
647		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
648			drop_links(nd);
649			nd->depth = i + 1;
650			return false;
651		}
652	}
653	return true;
654}
655
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */
666
667/**
668 * unlazy_walk - try to switch to ref-walk mode.
669 * @nd: nameidata pathwalk data
670 * @dentry: child of nd->path.dentry or NULL
671 * @seq: seq number to check dentry against
672 * Returns: 0 on success, -ECHILD on failure
673 *
674 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
675 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
676 * @nd or NULL.  Must be called from rcu-walk context.
677 * Nothing should touch nameidata between unlazy_walk() failure and
678 * terminate_walk().
679 */
680static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq)
681{
682	struct dentry *parent = nd->path.dentry;
683
684	BUG_ON(!(nd->flags & LOOKUP_RCU));
685
686	nd->flags &= ~LOOKUP_RCU;
687	if (unlikely(!legitimize_links(nd)))
688		goto out2;
689	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
690		goto out2;
691	if (unlikely(!lockref_get_not_dead(&parent->d_lockref)))
692		goto out1;
693
694	/*
695	 * For a negative lookup, the lookup sequence point is the parents
696	 * sequence point, and it only needs to revalidate the parent dentry.
697	 *
698	 * For a positive lookup, we need to move both the parent and the
699	 * dentry from the RCU domain to be properly refcounted. And the
700	 * sequence number in the dentry validates *both* dentry counters,
701	 * since we checked the sequence number of the parent after we got
702	 * the child sequence number. So we know the parent must still
703	 * be valid if the child sequence number is still valid.
704	 */
705	if (!dentry) {
706		if (read_seqcount_retry(&parent->d_seq, nd->seq))
707			goto out;
708		BUG_ON(nd->inode != parent->d_inode);
709	} else {
710		if (!lockref_get_not_dead(&dentry->d_lockref))
711			goto out;
712		if (read_seqcount_retry(&dentry->d_seq, seq))
713			goto drop_dentry;
714	}
715
716	/*
717	 * Sequence counts matched. Now make sure that the root is
718	 * still valid and get it if required.
719	 */
720	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
721		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
722			rcu_read_unlock();
723			dput(dentry);
724			return -ECHILD;
725		}
726	}
727
728	rcu_read_unlock();
729	return 0;
730
731drop_dentry:
732	rcu_read_unlock();
733	dput(dentry);
734	goto drop_root_mnt;
735out2:
736	nd->path.mnt = NULL;
737out1:
738	nd->path.dentry = NULL;
739out:
740	rcu_read_unlock();
741drop_root_mnt:
742	if (!(nd->flags & LOOKUP_ROOT))
743		nd->root.mnt = NULL;
744	return -ECHILD;
745}
746
747static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq)
748{
749	if (unlikely(!legitimize_path(nd, link, seq))) {
750		drop_links(nd);
751		nd->depth = 0;
752		nd->flags &= ~LOOKUP_RCU;
753		nd->path.mnt = NULL;
754		nd->path.dentry = NULL;
755		if (!(nd->flags & LOOKUP_ROOT))
756			nd->root.mnt = NULL;
757		rcu_read_unlock();
758	} else if (likely(unlazy_walk(nd, NULL, 0)) == 0) {
759		return 0;
760	}
761	path_put(link);
762	return -ECHILD;
763}
764
765static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
766{
767	return dentry->d_op->d_revalidate(dentry, flags);
768}
769
770/**
771 * complete_walk - successful completion of path walk
772 * @nd:  pointer nameidata
773 *
774 * If we had been in RCU mode, drop out of it and legitimize nd->path.
775 * Revalidate the final result, unless we'd already done that during
776 * the path walk or the filesystem doesn't ask for it.  Return 0 on
777 * success, -error on failure.  In case of failure caller does not
778 * need to drop nd->path.
779 */
780static int complete_walk(struct nameidata *nd)
781{
782	struct dentry *dentry = nd->path.dentry;
783	int status;
784
785	if (nd->flags & LOOKUP_RCU) {
786		if (!(nd->flags & LOOKUP_ROOT))
787			nd->root.mnt = NULL;
788		if (unlikely(unlazy_walk(nd, NULL, 0)))
789			return -ECHILD;
790	}
791
792	if (likely(!(nd->flags & LOOKUP_JUMPED)))
793		return 0;
794
795	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
796		return 0;
797
798	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
799	if (status > 0)
800		return 0;
801
802	if (!status)
803		status = -ESTALE;
804
805	return status;
806}
807
808static void set_root(struct nameidata *nd)
809{
810	get_fs_root(current->fs, &nd->root);
811}
812
813static void set_root_rcu(struct nameidata *nd)
814{
815	struct fs_struct *fs = current->fs;
816	unsigned seq;
817
818	do {
819		seq = read_seqcount_begin(&fs->seq);
820		nd->root = fs->root;
821		nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
822	} while (read_seqcount_retry(&fs->seq, seq));
823}
824
825static void path_put_conditional(struct path *path, struct nameidata *nd)
826{
827	dput(path->dentry);
828	if (path->mnt != nd->path.mnt)
829		mntput(path->mnt);
830}
831
832static inline void path_to_nameidata(const struct path *path,
833					struct nameidata *nd)
834{
835	if (!(nd->flags & LOOKUP_RCU)) {
836		dput(nd->path.dentry);
837		if (nd->path.mnt != path->mnt)
838			mntput(nd->path.mnt);
839	}
840	nd->path.mnt = path->mnt;
841	nd->path.dentry = path->dentry;
842}
843
844/*
845 * Helper to directly jump to a known parsed path from ->follow_link,
846 * caller must have taken a reference to path beforehand.
847 */
848void nd_jump_link(struct path *path)
849{
850	struct nameidata *nd = current->nameidata;
851	path_put(&nd->path);
852
853	nd->path = *path;
854	nd->inode = nd->path.dentry->d_inode;
855	nd->flags |= LOOKUP_JUMPED;
856}
857
858static inline void put_link(struct nameidata *nd)
859{
860	struct saved *last = nd->stack + --nd->depth;
861	struct inode *inode = last->inode;
862	if (last->cookie && inode->i_op->put_link)
863		inode->i_op->put_link(inode, last->cookie);
864	if (!(nd->flags & LOOKUP_RCU))
865		path_put(&last->link);
866}
867
868int sysctl_protected_symlinks __read_mostly = 0;
869int sysctl_protected_hardlinks __read_mostly = 0;
870
871/**
872 * may_follow_link - Check symlink following for unsafe situations
873 * @nd: nameidata pathwalk data
874 *
875 * In the case of the sysctl_protected_symlinks sysctl being enabled,
876 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
877 * in a sticky world-writable directory. This is to protect privileged
878 * processes from failing races against path names that may change out
879 * from under them by way of other users creating malicious symlinks.
880 * It will permit symlinks to be followed only when outside a sticky
881 * world-writable directory, or when the uid of the symlink and follower
882 * match, or when the directory owner matches the symlink's owner.
883 *
884 * Returns 0 if following the symlink is allowed, -ve on error.
885 */
886static inline int may_follow_link(struct nameidata *nd)
887{
888	const struct inode *inode;
889	const struct inode *parent;
890
891	if (!sysctl_protected_symlinks)
892		return 0;
893
894	/* Allowed if owner and follower match. */
895	inode = nd->stack[0].inode;
896	if (uid_eq(current_cred()->fsuid, inode->i_uid))
897		return 0;
898
899	/* Allowed if parent directory not sticky and world-writable. */
900	parent = nd->inode;
901	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
902		return 0;
903
904	/* Allowed if parent directory and link owner match. */
905	if (uid_eq(parent->i_uid, inode->i_uid))
906		return 0;
907
908	if (nd->flags & LOOKUP_RCU)
909		return -ECHILD;
910
911	audit_log_link_denied("follow_link", &nd->stack[0].link);
912	return -EACCES;
913}
914
915/**
916 * safe_hardlink_source - Check for safe hardlink conditions
917 * @inode: the source inode to hardlink from
918 *
919 * Return false if at least one of the following conditions:
920 *    - inode is not a regular file
921 *    - inode is setuid
922 *    - inode is setgid and group-exec
923 *    - access failure for read and write
924 *
925 * Otherwise returns true.
926 */
927static bool safe_hardlink_source(struct inode *inode)
928{
929	umode_t mode = inode->i_mode;
930
931	/* Special files should not get pinned to the filesystem. */
932	if (!S_ISREG(mode))
933		return false;
934
935	/* Setuid files should not get pinned to the filesystem. */
936	if (mode & S_ISUID)
937		return false;
938
939	/* Executable setgid files should not get pinned to the filesystem. */
940	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
941		return false;
942
943	/* Hardlinking to unreadable or unwritable sources is dangerous. */
944	if (inode_permission(inode, MAY_READ | MAY_WRITE))
945		return false;
946
947	return true;
948}
949
950/**
951 * may_linkat - Check permissions for creating a hardlink
952 * @link: the source to hardlink from
953 *
954 * Block hardlink when all of:
955 *  - sysctl_protected_hardlinks enabled
956 *  - fsuid does not match inode
957 *  - hardlink source is unsafe (see safe_hardlink_source() above)
958 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
959 *
960 * Returns 0 if successful, -ve on error.
961 */
962static int may_linkat(struct path *link)
963{
964	struct inode *inode;
965
966	if (!sysctl_protected_hardlinks)
967		return 0;
968
969	inode = link->dentry->d_inode;
970
971	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
972	 * otherwise, it must be a safe source.
973	 */
974	if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
975		return 0;
976
977	audit_log_link_denied("linkat", link);
978	return -EPERM;
979}
980
981static __always_inline
982const char *get_link(struct nameidata *nd)
983{
984	struct saved *last = nd->stack + nd->depth - 1;
985	struct dentry *dentry = last->link.dentry;
986	struct inode *inode = last->inode;
987	int error;
988	const char *res;
989
990	if (!(nd->flags & LOOKUP_RCU)) {
991		touch_atime(&last->link);
992		cond_resched();
993	} else if (atime_needs_update(&last->link, inode)) {
994		if (unlikely(unlazy_walk(nd, NULL, 0)))
995			return ERR_PTR(-ECHILD);
996		touch_atime(&last->link);
997	}
998
999	error = security_inode_follow_link(dentry, inode,
1000					   nd->flags & LOOKUP_RCU);
1001	if (unlikely(error))
1002		return ERR_PTR(error);
1003
1004	nd->last_type = LAST_BIND;
1005	res = inode->i_link;
1006	if (!res) {
1007		if (nd->flags & LOOKUP_RCU) {
1008			if (unlikely(unlazy_walk(nd, NULL, 0)))
1009				return ERR_PTR(-ECHILD);
1010		}
1011		res = inode->i_op->follow_link(dentry, &last->cookie);
1012		if (IS_ERR_OR_NULL(res)) {
1013			last->cookie = NULL;
1014			return res;
1015		}
1016	}
1017	if (*res == '/') {
1018		if (nd->flags & LOOKUP_RCU) {
1019			struct dentry *d;
1020			if (!nd->root.mnt)
1021				set_root_rcu(nd);
1022			nd->path = nd->root;
1023			d = nd->path.dentry;
1024			nd->inode = d->d_inode;
1025			nd->seq = nd->root_seq;
1026			if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
1027				return ERR_PTR(-ECHILD);
1028		} else {
1029			if (!nd->root.mnt)
1030				set_root(nd);
1031			path_put(&nd->path);
1032			nd->path = nd->root;
1033			path_get(&nd->root);
1034			nd->inode = nd->path.dentry->d_inode;
1035		}
1036		nd->flags |= LOOKUP_JUMPED;
1037		while (unlikely(*++res == '/'))
1038			;
1039	}
1040	if (!*res)
1041		res = NULL;
1042	return res;
1043}
1044
1045/*
1046 * follow_up - Find the mountpoint of path's vfsmount
1047 *
1048 * Given a path, find the mountpoint of its source file system.
1049 * Replace @path with the path of the mountpoint in the parent mount.
1050 * Up is towards /.
1051 *
1052 * Return 1 if we went up a level and 0 if we were already at the
1053 * root.
1054 */
1055int follow_up(struct path *path)
1056{
1057	struct mount *mnt = real_mount(path->mnt);
1058	struct mount *parent;
1059	struct dentry *mountpoint;
1060
1061	read_seqlock_excl(&mount_lock);
1062	parent = mnt->mnt_parent;
1063	if (parent == mnt) {
1064		read_sequnlock_excl(&mount_lock);
1065		return 0;
1066	}
1067	mntget(&parent->mnt);
1068	mountpoint = dget(mnt->mnt_mountpoint);
1069	read_sequnlock_excl(&mount_lock);
1070	dput(path->dentry);
1071	path->dentry = mountpoint;
1072	mntput(path->mnt);
1073	path->mnt = &parent->mnt;
1074	return 1;
1075}
1076EXPORT_SYMBOL(follow_up);
1077
1078/*
1079 * Perform an automount
1080 * - return -EISDIR to tell follow_managed() to stop and return the path we
1081 *   were called with.
1082 */
1083static int follow_automount(struct path *path, struct nameidata *nd,
1084			    bool *need_mntput)
1085{
1086	struct vfsmount *mnt;
1087	int err;
1088
1089	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
1090		return -EREMOTE;
1091
1092	/* We don't want to mount if someone's just doing a stat -
1093	 * unless they're stat'ing a directory and appended a '/' to
1094	 * the name.
1095	 *
1096	 * We do, however, want to mount if someone wants to open or
1097	 * create a file of any type under the mountpoint, wants to
1098	 * traverse through the mountpoint or wants to open the
1099	 * mounted directory.  Also, autofs may mark negative dentries
1100	 * as being automount points.  These will need the attentions
1101	 * of the daemon to instantiate them before they can be used.
1102	 */
1103	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1104			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1105	    path->dentry->d_inode)
1106		return -EISDIR;
1107
1108	nd->total_link_count++;
1109	if (nd->total_link_count >= 40)
1110		return -ELOOP;
1111
1112	mnt = path->dentry->d_op->d_automount(path);
1113	if (IS_ERR(mnt)) {
1114		/*
1115		 * The filesystem is allowed to return -EISDIR here to indicate
1116		 * it doesn't want to automount.  For instance, autofs would do
1117		 * this so that its userspace daemon can mount on this dentry.
1118		 *
1119		 * However, we can only permit this if it's a terminal point in
1120		 * the path being looked up; if it wasn't then the remainder of
1121		 * the path is inaccessible and we should say so.
1122		 */
1123		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
1124			return -EREMOTE;
1125		return PTR_ERR(mnt);
1126	}
1127
1128	if (!mnt) /* mount collision */
1129		return 0;
1130
1131	if (!*need_mntput) {
1132		/* lock_mount() may release path->mnt on error */
1133		mntget(path->mnt);
1134		*need_mntput = true;
1135	}
1136	err = finish_automount(mnt, path);
1137
1138	switch (err) {
1139	case -EBUSY:
1140		/* Someone else made a mount here whilst we were busy */
1141		return 0;
1142	case 0:
1143		path_put(path);
1144		path->mnt = mnt;
1145		path->dentry = dget(mnt->mnt_root);
1146		return 0;
1147	default:
1148		return err;
1149	}
1150
1151}
1152
1153/*
1154 * Handle a dentry that is managed in some way.
1155 * - Flagged for transit management (autofs)
1156 * - Flagged as mountpoint
1157 * - Flagged as automount point
1158 *
1159 * This may only be called in refwalk mode.
1160 *
1161 * Serialization is taken care of in namespace.c
1162 */
1163static int follow_managed(struct path *path, struct nameidata *nd)
1164{
1165	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1166	unsigned managed;
1167	bool need_mntput = false;
1168	int ret = 0;
1169
1170	/* Given that we're not holding a lock here, we retain the value in a
1171	 * local variable for each dentry as we look at it so that we don't see
1172	 * the components of that value change under us */
1173	while (managed = ACCESS_ONCE(path->dentry->d_flags),
1174	       managed &= DCACHE_MANAGED_DENTRY,
1175	       unlikely(managed != 0)) {
1176		/* Allow the filesystem to manage the transit without i_mutex
1177		 * being held. */
1178		if (managed & DCACHE_MANAGE_TRANSIT) {
1179			BUG_ON(!path->dentry->d_op);
1180			BUG_ON(!path->dentry->d_op->d_manage);
1181			ret = path->dentry->d_op->d_manage(path->dentry, false);
1182			if (ret < 0)
1183				break;
1184		}
1185
1186		/* Transit to a mounted filesystem. */
1187		if (managed & DCACHE_MOUNTED) {
1188			struct vfsmount *mounted = lookup_mnt(path);
1189			if (mounted) {
1190				dput(path->dentry);
1191				if (need_mntput)
1192					mntput(path->mnt);
1193				path->mnt = mounted;
1194				path->dentry = dget(mounted->mnt_root);
1195				need_mntput = true;
1196				continue;
1197			}
1198
1199			/* Something is mounted on this dentry in another
1200			 * namespace and/or whatever was mounted there in this
1201			 * namespace got unmounted before lookup_mnt() could
1202			 * get it */
1203		}
1204
1205		/* Handle an automount point */
1206		if (managed & DCACHE_NEED_AUTOMOUNT) {
1207			ret = follow_automount(path, nd, &need_mntput);
1208			if (ret < 0)
1209				break;
1210			continue;
1211		}
1212
1213		/* We didn't change the current path point */
1214		break;
1215	}
1216
1217	if (need_mntput && path->mnt == mnt)
1218		mntput(path->mnt);
1219	if (ret == -EISDIR)
1220		ret = 0;
1221	if (need_mntput)
1222		nd->flags |= LOOKUP_JUMPED;
1223	if (unlikely(ret < 0))
1224		path_put_conditional(path, nd);
1225	return ret;
1226}
1227
1228int follow_down_one(struct path *path)
1229{
1230	struct vfsmount *mounted;
1231
1232	mounted = lookup_mnt(path);
1233	if (mounted) {
1234		dput(path->dentry);
1235		mntput(path->mnt);
1236		path->mnt = mounted;
1237		path->dentry = dget(mounted->mnt_root);
1238		return 1;
1239	}
1240	return 0;
1241}
1242EXPORT_SYMBOL(follow_down_one);
1243
1244static inline int managed_dentry_rcu(struct dentry *dentry)
1245{
1246	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
1247		dentry->d_op->d_manage(dentry, true) : 0;
1248}
1249
1250/*
1251 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
1252 * we meet a managed dentry that would need blocking.
1253 */
1254static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1255			       struct inode **inode, unsigned *seqp)
1256{
1257	for (;;) {
1258		struct mount *mounted;
1259		/*
1260		 * Don't forget we might have a non-mountpoint managed dentry
1261		 * that wants to block transit.
1262		 */
1263		switch (managed_dentry_rcu(path->dentry)) {
1264		case -ECHILD:
1265		default:
1266			return false;
1267		case -EISDIR:
1268			return true;
1269		case 0:
1270			break;
1271		}
1272
1273		if (!d_mountpoint(path->dentry))
1274			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1275
1276		mounted = __lookup_mnt(path->mnt, path->dentry);
1277		if (!mounted)
1278			break;
1279		path->mnt = &mounted->mnt;
1280		path->dentry = mounted->mnt.mnt_root;
1281		nd->flags |= LOOKUP_JUMPED;
1282		*seqp = read_seqcount_begin(&path->dentry->d_seq);
1283		/*
1284		 * Update the inode too. We don't need to re-check the
1285		 * dentry sequence number here after this d_inode read,
1286		 * because a mount-point is always pinned.
1287		 */
1288		*inode = path->dentry->d_inode;
1289	}
1290	return !read_seqretry(&mount_lock, nd->m_seq) &&
1291		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1292}
1293
1294static int follow_dotdot_rcu(struct nameidata *nd)
1295{
1296	struct inode *inode = nd->inode;
1297	if (!nd->root.mnt)
1298		set_root_rcu(nd);
1299
1300	while (1) {
1301		if (path_equal(&nd->path, &nd->root))
1302			break;
1303		if (nd->path.dentry != nd->path.mnt->mnt_root) {
1304			struct dentry *old = nd->path.dentry;
1305			struct dentry *parent = old->d_parent;
1306			unsigned seq;
1307
1308			inode = parent->d_inode;
1309			seq = read_seqcount_begin(&parent->d_seq);
1310			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
1311				return -ECHILD;
1312			nd->path.dentry = parent;
1313			nd->seq = seq;
1314			if (unlikely(!path_connected(&nd->path)))
1315				return -ENOENT;
1316			break;
1317		} else {
1318			struct mount *mnt = real_mount(nd->path.mnt);
1319			struct mount *mparent = mnt->mnt_parent;
1320			struct dentry *mountpoint = mnt->mnt_mountpoint;
1321			struct inode *inode2 = mountpoint->d_inode;
1322			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
1323			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1324				return -ECHILD;
1325			if (&mparent->mnt == nd->path.mnt)
1326				break;
1327			/* we know that mountpoint was pinned */
1328			nd->path.dentry = mountpoint;
1329			nd->path.mnt = &mparent->mnt;
1330			inode = inode2;
1331			nd->seq = seq;
1332		}
1333	}
1334	while (unlikely(d_mountpoint(nd->path.dentry))) {
1335		struct mount *mounted;
1336		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
1337		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1338			return -ECHILD;
1339		if (!mounted)
1340			break;
1341		nd->path.mnt = &mounted->mnt;
1342		nd->path.dentry = mounted->mnt.mnt_root;
1343		inode = nd->path.dentry->d_inode;
1344		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1345	}
1346	nd->inode = inode;
1347	return 0;
1348}
1349
1350/*
1351 * Follow down to the covering mount currently visible to userspace.  At each
1352 * point, the filesystem owning that dentry may be queried as to whether the
1353 * caller is permitted to proceed or not.
1354 */
1355int follow_down(struct path *path)
1356{
1357	unsigned managed;
1358	int ret;
1359
1360	while (managed = ACCESS_ONCE(path->dentry->d_flags),
1361	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1362		/* Allow the filesystem to manage the transit without i_mutex
1363		 * being held.
1364		 *
1365		 * We indicate to the filesystem if someone is trying to mount
1366		 * something here.  This gives autofs the chance to deny anyone
1367		 * other than its daemon the right to mount on its
1368		 * superstructure.
1369		 *
1370		 * The filesystem may sleep at this point.
1371		 */
1372		if (managed & DCACHE_MANAGE_TRANSIT) {
1373			BUG_ON(!path->dentry->d_op);
1374			BUG_ON(!path->dentry->d_op->d_manage);
1375			ret = path->dentry->d_op->d_manage(
1376				path->dentry, false);
1377			if (ret < 0)
1378				return ret == -EISDIR ? 0 : ret;
1379		}
1380
1381		/* Transit to a mounted filesystem. */
1382		if (managed & DCACHE_MOUNTED) {
1383			struct vfsmount *mounted = lookup_mnt(path);
1384			if (!mounted)
1385				break;
1386			dput(path->dentry);
1387			mntput(path->mnt);
1388			path->mnt = mounted;
1389			path->dentry = dget(mounted->mnt_root);
1390			continue;
1391		}
1392
1393		/* Don't handle automount points here */
1394		break;
1395	}
1396	return 0;
1397}
1398EXPORT_SYMBOL(follow_down);
1399
1400/*
1401 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1402 */
1403static void follow_mount(struct path *path)
1404{
1405	while (d_mountpoint(path->dentry)) {
1406		struct vfsmount *mounted = lookup_mnt(path);
1407		if (!mounted)
1408			break;
1409		dput(path->dentry);
1410		mntput(path->mnt);
1411		path->mnt = mounted;
1412		path->dentry = dget(mounted->mnt_root);
1413	}
1414}
1415
1416static int follow_dotdot(struct nameidata *nd)
1417{
1418	if (!nd->root.mnt)
1419		set_root(nd);
1420
1421	while(1) {
1422		struct dentry *old = nd->path.dentry;
1423
1424		if (nd->path.dentry == nd->root.dentry &&
1425		    nd->path.mnt == nd->root.mnt) {
1426			break;
1427		}
1428		if (nd->path.dentry != nd->path.mnt->mnt_root) {
1429			/* rare case of legitimate dget_parent()... */
1430			nd->path.dentry = dget_parent(nd->path.dentry);
1431			dput(old);
1432			if (unlikely(!path_connected(&nd->path)))
1433				return -ENOENT;
1434			break;
1435		}
1436		if (!follow_up(&nd->path))
1437			break;
1438	}
1439	follow_mount(&nd->path);
1440	nd->inode = nd->path.dentry->d_inode;
1441	return 0;
1442}
1443
1444/*
1445 * This looks up the name in dcache, possibly revalidates the old dentry and
1446 * allocates a new one if not found or not valid.  In the need_lookup argument
1447 * returns whether i_op->lookup is necessary.
1448 *
1449 * dir->d_inode->i_mutex must be held
1450 */
1451static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
1452				    unsigned int flags, bool *need_lookup)
1453{
1454	struct dentry *dentry;
1455	int error;
1456
1457	*need_lookup = false;
1458	dentry = d_lookup(dir, name);
1459	if (dentry) {
1460		if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1461			error = d_revalidate(dentry, flags);
1462			if (unlikely(error <= 0)) {
1463				if (error < 0) {
1464					dput(dentry);
1465					return ERR_PTR(error);
1466				} else {
1467					d_invalidate(dentry);
1468					dput(dentry);
1469					dentry = NULL;
1470				}
1471			}
1472		}
1473	}
1474
1475	if (!dentry) {
1476		dentry = d_alloc(dir, name);
1477		if (unlikely(!dentry))
1478			return ERR_PTR(-ENOMEM);
1479
1480		*need_lookup = true;
1481	}
1482	return dentry;
1483}
1484
1485/*
1486 * Call i_op->lookup on the dentry.  The dentry must be negative and
1487 * unhashed.
1488 *
1489 * dir->d_inode->i_mutex must be held
1490 */
1491static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1492				  unsigned int flags)
1493{
1494	struct dentry *old;
1495
1496	/* Don't create child dentry for a dead directory. */
1497	if (unlikely(IS_DEADDIR(dir))) {
1498		dput(dentry);
1499		return ERR_PTR(-ENOENT);
1500	}
1501
1502	old = dir->i_op->lookup(dir, dentry, flags);
1503	if (unlikely(old)) {
1504		dput(dentry);
1505		dentry = old;
1506	}
1507	return dentry;
1508}
1509
1510static struct dentry *__lookup_hash(struct qstr *name,
1511		struct dentry *base, unsigned int flags)
1512{
1513	bool need_lookup;
1514	struct dentry *dentry;
1515
1516	dentry = lookup_dcache(name, base, flags, &need_lookup);
1517	if (!need_lookup)
1518		return dentry;
1519
1520	return lookup_real(base->d_inode, dentry, flags);
1521}
1522
1523/*
1524 *  It's more convoluted than I'd like it to be, but... it's still fairly
1525 *  small and for now I'd prefer to have fast path as straight as possible.
1526 *  It _is_ time-critical.
1527 */
1528static int lookup_fast(struct nameidata *nd,
1529		       struct path *path, struct inode **inode,
1530		       unsigned *seqp)
1531{
1532	struct vfsmount *mnt = nd->path.mnt;
1533	struct dentry *dentry, *parent = nd->path.dentry;
1534	int need_reval = 1;
1535	int status = 1;
1536	int err;
1537
1538	/*
1539	 * Rename seqlock is not required here because in the off chance
1540	 * of a false negative due to a concurrent rename, we're going to
1541	 * do the non-racy lookup, below.
1542	 */
1543	if (nd->flags & LOOKUP_RCU) {
1544		unsigned seq;
1545		bool negative;
1546		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1547		if (!dentry)
1548			goto unlazy;
1549
1550		/*
1551		 * This sequence count validates that the inode matches
1552		 * the dentry name information from lookup.
1553		 */
1554		*inode = d_backing_inode(dentry);
1555		negative = d_is_negative(dentry);
1556		if (read_seqcount_retry(&dentry->d_seq, seq))
1557			return -ECHILD;
1558
1559		/*
1560		 * This sequence count validates that the parent had no
1561		 * changes while we did the lookup of the dentry above.
1562		 *
1563		 * The memory barrier in read_seqcount_begin of child is
1564		 *  enough, we can use __read_seqcount_retry here.
1565		 */
1566		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1567			return -ECHILD;
1568
1569		*seqp = seq;
1570		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1571			status = d_revalidate(dentry, nd->flags);
1572			if (unlikely(status <= 0)) {
1573				if (status != -ECHILD)
1574					need_reval = 0;
1575				goto unlazy;
1576			}
1577		}
1578		/*
1579		 * Note: do negative dentry check after revalidation in
1580		 * case that drops it.
1581		 */
1582		if (negative)
1583			return -ENOENT;
1584		path->mnt = mnt;
1585		path->dentry = dentry;
1586		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1587			return 0;
1588unlazy:
1589		if (unlazy_walk(nd, dentry, seq))
1590			return -ECHILD;
1591	} else {
1592		dentry = __d_lookup(parent, &nd->last);
1593	}
1594
1595	if (unlikely(!dentry))
1596		goto need_lookup;
1597
1598	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1599		status = d_revalidate(dentry, nd->flags);
1600	if (unlikely(status <= 0)) {
1601		if (status < 0) {
1602			dput(dentry);
1603			return status;
1604		}
1605		d_invalidate(dentry);
1606		dput(dentry);
1607		goto need_lookup;
1608	}
1609
1610	if (unlikely(d_is_negative(dentry))) {
1611		dput(dentry);
1612		return -ENOENT;
1613	}
1614	path->mnt = mnt;
1615	path->dentry = dentry;
1616	err = follow_managed(path, nd);
1617	if (likely(!err))
1618		*inode = d_backing_inode(path->dentry);
1619	return err;
1620
1621need_lookup:
1622	return 1;
1623}
1624
1625/* Fast lookup failed, do it the slow way */
1626static int lookup_slow(struct nameidata *nd, struct path *path)
1627{
1628	struct dentry *dentry, *parent;
1629
1630	parent = nd->path.dentry;
1631	BUG_ON(nd->inode != parent->d_inode);
1632
1633	mutex_lock(&parent->d_inode->i_mutex);
1634	dentry = __lookup_hash(&nd->last, parent, nd->flags);
1635	mutex_unlock(&parent->d_inode->i_mutex);
1636	if (IS_ERR(dentry))
1637		return PTR_ERR(dentry);
1638	path->mnt = nd->path.mnt;
1639	path->dentry = dentry;
1640	return follow_managed(path, nd);
1641}
1642
1643static inline int may_lookup(struct nameidata *nd)
1644{
1645	if (nd->flags & LOOKUP_RCU) {
1646		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1647		if (err != -ECHILD)
1648			return err;
1649		if (unlazy_walk(nd, NULL, 0))
1650			return -ECHILD;
1651	}
1652	return inode_permission(nd->inode, MAY_EXEC);
1653}
1654
1655static inline int handle_dots(struct nameidata *nd, int type)
1656{
1657	if (type == LAST_DOTDOT) {
1658		if (nd->flags & LOOKUP_RCU) {
1659			return follow_dotdot_rcu(nd);
1660		} else
1661			return follow_dotdot(nd);
1662	}
1663	return 0;
1664}
1665
1666static int pick_link(struct nameidata *nd, struct path *link,
1667		     struct inode *inode, unsigned seq)
1668{
1669	int error;
1670	struct saved *last;
1671	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
1672		path_to_nameidata(link, nd);
1673		return -ELOOP;
1674	}
1675	if (!(nd->flags & LOOKUP_RCU)) {
1676		if (link->mnt == nd->path.mnt)
1677			mntget(link->mnt);
1678	}
1679	error = nd_alloc_stack(nd);
1680	if (unlikely(error)) {
1681		if (error == -ECHILD) {
1682			if (unlikely(unlazy_link(nd, link, seq)))
1683				return -ECHILD;
1684			error = nd_alloc_stack(nd);
1685		}
1686		if (error) {
1687			path_put(link);
1688			return error;
1689		}
1690	}
1691
1692	last = nd->stack + nd->depth++;
1693	last->link = *link;
1694	last->cookie = NULL;
1695	last->inode = inode;
1696	last->seq = seq;
1697	return 1;
1698}
1699
1700/*
1701 * Do we need to follow links? We _really_ want to be able
1702 * to do this check without having to look at inode->i_op,
1703 * so we keep a cache of "no, this doesn't need follow_link"
1704 * for the common case.
1705 */
1706static inline int should_follow_link(struct nameidata *nd, struct path *link,
1707				     int follow,
1708				     struct inode *inode, unsigned seq)
1709{
1710	if (likely(!d_is_symlink(link->dentry)))
1711		return 0;
1712	if (!follow)
1713		return 0;
1714	/* make sure that d_is_symlink above matches inode */
1715	if (nd->flags & LOOKUP_RCU) {
1716		if (read_seqcount_retry(&link->dentry->d_seq, seq))
1717			return -ECHILD;
1718	}
1719	return pick_link(nd, link, inode, seq);
1720}
1721
1722enum {WALK_GET = 1, WALK_PUT = 2};
1723
1724static int walk_component(struct nameidata *nd, int flags)
1725{
1726	struct path path;
1727	struct inode *inode;
1728	unsigned seq;
1729	int err;
1730	/*
1731	 * "." and ".." are special - ".." especially so because it has
1732	 * to be able to know about the current root directory and
1733	 * parent relationships.
1734	 */
1735	if (unlikely(nd->last_type != LAST_NORM)) {
1736		err = handle_dots(nd, nd->last_type);
1737		if (flags & WALK_PUT)
1738			put_link(nd);
1739		return err;
1740	}
1741	err = lookup_fast(nd, &path, &inode, &seq);
1742	if (unlikely(err)) {
1743		if (err < 0)
1744			return err;
1745
1746		err = lookup_slow(nd, &path);
1747		if (err < 0)
1748			return err;
1749
1750		seq = 0;	/* we are already out of RCU mode */
1751		err = -ENOENT;
1752		if (d_is_negative(path.dentry))
1753			goto out_path_put;
1754		inode = d_backing_inode(path.dentry);
1755	}
1756
1757	if (flags & WALK_PUT)
1758		put_link(nd);
1759	err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
1760	if (unlikely(err))
1761		return err;
1762	path_to_nameidata(&path, nd);
1763	nd->inode = inode;
1764	nd->seq = seq;
1765	return 0;
1766
1767out_path_put:
1768	path_to_nameidata(&path, nd);
1769	return err;
1770}
1771
1772/*
1773 * We can do the critical dentry name comparison and hashing
1774 * operations one word at a time, but we are limited to:
1775 *
1776 * - Architectures with fast unaligned word accesses. We could
1777 *   do a "get_unaligned()" if this helps and is sufficiently
1778 *   fast.
1779 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
1783 *
1784 * - Furthermore, we need an efficient 64-bit compile for the
1785 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
1788 */
1789#ifdef CONFIG_DCACHE_WORD_ACCESS
1790
1791#include <asm/word-at-a-time.h>
1792
#ifndef CONFIG_64BIT

/* 32-bit case: the accumulated hash is used as it is */
#define fold_hash(x) (x)

#else	/* 64-bit case */

/* Reduce the unsigned long accumulator to 32 bits with hash_64(). */
static inline unsigned int fold_hash(unsigned long hash)
{
	return hash_64(hash, 32);
}

#endif
1805
/*
 * Hash the @len bytes at @name, consuming one unsigned long per step.
 * Whole words are mixed in with "add, then multiply by 9"; a trailing
 * partial word is masked down to its first @len bytes and added without
 * the multiply.
 */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long hash = 0;
	unsigned long word;

	for (;;) {
		word = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long)) {
			/* tail (possibly empty): only 'len' bytes count */
			hash += bytemask_from_count(len) & word;
			break;
		}
		hash += word;
		hash *= 9;
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
		if (!len)
			break;
	}
	return fold_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
1828
1829/*
1830 * Calculate the length and hash of the path component, and
1831 * return the "hash_len" as the result.
1832 */
1833static inline u64 hash_name(const char *name)
1834{
1835	unsigned long a, b, adata, bdata, mask, hash, len;
1836	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1837
1838	hash = a = 0;
1839	len = -sizeof(unsigned long);
1840	do {
1841		hash = (hash + a) * 9;
1842		len += sizeof(unsigned long);
1843		a = load_unaligned_zeropad(name+len);
1844		b = a ^ REPEAT_BYTE('/');
1845	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
1846
1847	adata = prep_zero_mask(a, adata, &constants);
1848	bdata = prep_zero_mask(b, bdata, &constants);
1849
1850	mask = create_zero_mask(adata | bdata);
1851
1852	hash += a & zero_bytemask(mask);
1853	len += find_zero(mask);
1854	return hashlen_create(fold_hash(hash), len);
1855}
1856
1857#else
1858
/* Byte-at-a-time hash of the @len bytes at @name. */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long hash = init_name_hash();

	for (; len; len--)
		hash = partial_name_hash(*name++, hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
1867
1868/*
1869 * We know there's a real path component here of at least
1870 * one character.
1871 */
1872static inline u64 hash_name(const char *name)
1873{
1874	unsigned long hash = init_name_hash();
1875	unsigned long len = 0, c;
1876
1877	c = (unsigned char)*name;
1878	do {
1879		len++;
1880		hash = partial_name_hash(c, hash);
1881		c = (unsigned char)name[len];
1882	} while (c && c != '/');
1883	return hashlen_create(end_name_hash(hash), len);
1884}
1885
1886#endif
1887
1888/*
1889 * Name resolution.
1890 * This is the basic name resolution function, turning a pathname into
1891 * the final dentry. We expect 'base' to be positive and a directory.
1892 *
1893 * Returns 0 and nd will have valid dentry and mnt on success.
1894 * Returns error and drops reference to input namei data on failure.
1895 */
1896static int link_path_walk(const char *name, struct nameidata *nd)
1897{
1898	int err;
1899
1900	while (*name=='/')
1901		name++;
1902	if (!*name)
1903		return 0;
1904
1905	/* At this point we know we have a real path component. */
1906	for(;;) {
1907		u64 hash_len;
1908		int type;
1909
1910		err = may_lookup(nd);
1911 		if (err)
1912			return err;
1913
1914		hash_len = hash_name(name);
1915
1916		type = LAST_NORM;
1917		if (name[0] == '.') switch (hashlen_len(hash_len)) {
1918			case 2:
1919				if (name[1] == '.') {
1920					type = LAST_DOTDOT;
1921					nd->flags |= LOOKUP_JUMPED;
1922				}
1923				break;
1924			case 1:
1925				type = LAST_DOT;
1926		}
1927		if (likely(type == LAST_NORM)) {
1928			struct dentry *parent = nd->path.dentry;
1929			nd->flags &= ~LOOKUP_JUMPED;
1930			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1931				struct qstr this = { { .hash_len = hash_len }, .name = name };
1932				err = parent->d_op->d_hash(parent, &this);
1933				if (err < 0)
1934					return err;
1935				hash_len = this.hash_len;
1936				name = this.name;
1937			}
1938		}
1939
1940		nd->last.hash_len = hash_len;
1941		nd->last.name = name;
1942		nd->last_type = type;
1943
1944		name += hashlen_len(hash_len);
1945		if (!*name)
1946			goto OK;
1947		/*
1948		 * If it wasn't NUL, we know it was '/'. Skip that
1949		 * slash, and continue until no more slashes.
1950		 */
1951		do {
1952			name++;
1953		} while (unlikely(*name == '/'));
1954		if (unlikely(!*name)) {
1955OK:
1956			/* pathname body, done */
1957			if (!nd->depth)
1958				return 0;
1959			name = nd->stack[nd->depth - 1].name;
1960			/* trailing symlink, done */
1961			if (!name)
1962				return 0;
1963			/* last component of nested symlink */
1964			err = walk_component(nd, WALK_GET | WALK_PUT);
1965		} else {
1966			err = walk_component(nd, WALK_GET);
1967		}
1968		if (err < 0)
1969			return err;
1970
1971		if (err) {
1972			const char *s = get_link(nd);
1973
1974			if (IS_ERR(s))
1975				return PTR_ERR(s);
1976			err = 0;
1977			if (unlikely(!s)) {
1978				/* jumped */
1979				put_link(nd);
1980			} else {
1981				nd->stack[nd->depth - 1].name = name;
1982				name = s;
1983				continue;
1984			}
1985		}
1986		if (unlikely(!d_can_lookup(nd->path.dentry))) {
1987			if (nd->flags & LOOKUP_RCU) {
1988				if (unlazy_walk(nd, NULL, 0))
1989					return -ECHILD;
1990			}
1991			return -ENOTDIR;
1992		}
1993	}
1994}
1995
1996static const char *path_init(struct nameidata *nd, unsigned flags)
1997{
1998	int retval = 0;
1999	const char *s = nd->name->name;
2000
2001	nd->last_type = LAST_ROOT; /* if there are only slashes... */
2002	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
2003	nd->depth = 0;
2004	if (flags & LOOKUP_ROOT) {
2005		struct dentry *root = nd->root.dentry;
2006		struct inode *inode = root->d_inode;
2007		if (*s) {
2008			if (!d_can_lookup(root))
2009				return ERR_PTR(-ENOTDIR);
2010			retval = inode_permission(inode, MAY_EXEC);
2011			if (retval)
2012				return ERR_PTR(retval);
2013		}
2014		nd->path = nd->root;
2015		nd->inode = inode;
2016		if (flags & LOOKUP_RCU) {
2017			rcu_read_lock();
2018			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2019			nd->root_seq = nd->seq;
2020			nd->m_seq = read_seqbegin(&mount_lock);
2021		} else {
2022			path_get(&nd->path);
2023		}
2024		return s;
2025	}
2026
2027	nd->root.mnt = NULL;
2028
2029	nd->m_seq = read_seqbegin(&mount_lock);
2030	if (*s == '/') {
2031		if (flags & LOOKUP_RCU) {
2032			rcu_read_lock();
2033			set_root_rcu(nd);
2034			nd->seq = nd->root_seq;
2035		} else {
2036			set_root(nd);
2037			path_get(&nd->root);
2038		}
2039		nd->path = nd->root;
2040	} else if (nd->dfd == AT_FDCWD) {
2041		if (flags & LOOKUP_RCU) {
2042			struct fs_struct *fs = current->fs;
2043			unsigned seq;
2044
2045			rcu_read_lock();
2046
2047			do {
2048				seq = read_seqcount_begin(&fs->seq);
2049				nd->path = fs->pwd;
2050				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2051			} while (read_seqcount_retry(&fs->seq, seq));
2052		} else {
2053			get_fs_pwd(current->fs, &nd->path);
2054		}
2055	} else {
2056		/* Caller must check execute permissions on the starting path component */
2057		struct fd f = fdget_raw(nd->dfd);
2058		struct dentry *dentry;
2059
2060		if (!f.file)
2061			return ERR_PTR(-EBADF);
2062
2063		dentry = f.file->f_path.dentry;
2064
2065		if (*s) {
2066			if (!d_can_lookup(dentry)) {
2067				fdput(f);
2068				return ERR_PTR(-ENOTDIR);
2069			}
2070		}
2071
2072		nd->path = f.file->f_path;
2073		if (flags & LOOKUP_RCU) {
2074			rcu_read_lock();
2075			nd->inode = nd->path.dentry->d_inode;
2076			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2077		} else {
2078			path_get(&nd->path);
2079			nd->inode = nd->path.dentry->d_inode;
2080		}
2081		fdput(f);
2082		return s;
2083	}
2084
2085	nd->inode = nd->path.dentry->d_inode;
2086	if (!(flags & LOOKUP_RCU))
2087		return s;
2088	if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
2089		return s;
2090	if (!(nd->flags & LOOKUP_ROOT))
2091		nd->root.mnt = NULL;
2092	rcu_read_unlock();
2093	return ERR_PTR(-ECHILD);
2094}
2095
2096static const char *trailing_symlink(struct nameidata *nd)
2097{
2098	const char *s;
2099	int error = may_follow_link(nd);
2100	if (unlikely(error))
2101		return ERR_PTR(error);
2102	nd->flags |= LOOKUP_PARENT;
2103	nd->stack[0].name = NULL;
2104	s = get_link(nd);
2105	return s ? s : "";
2106}
2107
2108static inline int lookup_last(struct nameidata *nd)
2109{
2110	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2111		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2112
2113	nd->flags &= ~LOOKUP_PARENT;
2114	return walk_component(nd,
2115			nd->flags & LOOKUP_FOLLOW
2116				? nd->depth
2117					? WALK_PUT | WALK_GET
2118					: WALK_GET
2119				: 0);
2120}
2121
2122/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2123static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2124{
2125	const char *s = path_init(nd, flags);
2126	int err;
2127
2128	if (IS_ERR(s))
2129		return PTR_ERR(s);
2130	while (!(err = link_path_walk(s, nd))
2131		&& ((err = lookup_last(nd)) > 0)) {
2132		s = trailing_symlink(nd);
2133		if (IS_ERR(s)) {
2134			err = PTR_ERR(s);
2135			break;
2136		}
2137	}
2138	if (!err)
2139		err = complete_walk(nd);
2140
2141	if (!err && nd->flags & LOOKUP_DIRECTORY)
2142		if (!d_can_lookup(nd->path.dentry))
2143			err = -ENOTDIR;
2144	if (!err) {
2145		*path = nd->path;
2146		nd->path.mnt = NULL;
2147		nd->path.dentry = NULL;
2148	}
2149	terminate_walk(nd);
2150	return err;
2151}
2152
2153static int filename_lookup(int dfd, struct filename *name, unsigned flags,
2154			   struct path *path, struct path *root)
2155{
2156	int retval;
2157	struct nameidata nd;
2158	if (IS_ERR(name))
2159		return PTR_ERR(name);
2160	if (unlikely(root)) {
2161		nd.root = *root;
2162		flags |= LOOKUP_ROOT;
2163	}
2164	set_nameidata(&nd, dfd, name);
2165	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
2166	if (unlikely(retval == -ECHILD))
2167		retval = path_lookupat(&nd, flags, path);
2168	if (unlikely(retval == -ESTALE))
2169		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
2170
2171	if (likely(!retval))
2172		audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
2173	restore_nameidata();
2174	putname(name);
2175	return retval;
2176}
2177
2178/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2179static int path_parentat(struct nameidata *nd, unsigned flags,
2180				struct path *parent)
2181{
2182	const char *s = path_init(nd, flags);
2183	int err;
2184	if (IS_ERR(s))
2185		return PTR_ERR(s);
2186	err = link_path_walk(s, nd);
2187	if (!err)
2188		err = complete_walk(nd);
2189	if (!err) {
2190		*parent = nd->path;
2191		nd->path.mnt = NULL;
2192		nd->path.dentry = NULL;
2193	}
2194	terminate_walk(nd);
2195	return err;
2196}
2197
2198static struct filename *filename_parentat(int dfd, struct filename *name,
2199				unsigned int flags, struct path *parent,
2200				struct qstr *last, int *type)
2201{
2202	int retval;
2203	struct nameidata nd;
2204
2205	if (IS_ERR(name))
2206		return name;
2207	set_nameidata(&nd, dfd, name);
2208	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2209	if (unlikely(retval == -ECHILD))
2210		retval = path_parentat(&nd, flags, parent);
2211	if (unlikely(retval == -ESTALE))
2212		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2213	if (likely(!retval)) {
2214		*last = nd.last;
2215		*type = nd.last_type;
2216		audit_inode(name, parent->dentry, LOOKUP_PARENT);
2217	} else {
2218		putname(name);
2219		name = ERR_PTR(retval);
2220	}
2221	restore_nameidata();
2222	return name;
2223}
2224
2225/* does lookup, returns the object with parent locked */
2226struct dentry *kern_path_locked(const char *name, struct path *path)
2227{
2228	struct filename *filename;
2229	struct dentry *d;
2230	struct qstr last;
2231	int type;
2232
2233	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
2234				    &last, &type);
2235	if (IS_ERR(filename))
2236		return ERR_CAST(filename);
2237	if (unlikely(type != LAST_NORM)) {
2238		path_put(path);
2239		putname(filename);
2240		return ERR_PTR(-EINVAL);
2241	}
2242	mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2243	d = __lookup_hash(&last, path->dentry, 0);
2244	if (IS_ERR(d)) {
2245		mutex_unlock(&path->dentry->d_inode->i_mutex);
2246		path_put(path);
2247	}
2248	putname(filename);
2249	return d;
2250}
2251
2252int kern_path(const char *name, unsigned int flags, struct path *path)
2253{
2254	return filename_lookup(AT_FDCWD, getname_kernel(name),
2255			       flags, path, NULL);
2256}
2257EXPORT_SYMBOL(kern_path);
2258
2259/**
2260 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2261 * @dentry:  pointer to dentry of the base directory
2262 * @mnt: pointer to vfs mount of the base directory
2263 * @name: pointer to file name
2264 * @flags: lookup flags
2265 * @path: pointer to struct path to fill
2266 */
2267int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2268		    const char *name, unsigned int flags,
2269		    struct path *path)
2270{
2271	struct path root = {.mnt = mnt, .dentry = dentry};
2272	/* the first argument of filename_lookup() is ignored with root */
2273	return filename_lookup(AT_FDCWD, getname_kernel(name),
2274			       flags , path, &root);
2275}
2276EXPORT_SYMBOL(vfs_path_lookup);
2277
2278/**
2279 * lookup_one_len - filesystem helper to lookup single pathname component
2280 * @name:	pathname component to lookup
2281 * @base:	base directory to lookup from
2282 * @len:	maximum length @len should be interpreted to
2283 *
2284 * Note that this routine is purely a helper for filesystem usage and should
2285 * not be called by generic code.
2286 */
2287struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2288{
2289	struct qstr this;
2290	unsigned int c;
2291	int err;
2292
2293	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
2294
2295	this.name = name;
2296	this.len = len;
2297	this.hash = full_name_hash(name, len);
2298	if (!len)
2299		return ERR_PTR(-EACCES);
2300
2301	if (unlikely(name[0] == '.')) {
2302		if (len < 2 || (len == 2 && name[1] == '.'))
2303			return ERR_PTR(-EACCES);
2304	}
2305
2306	while (len--) {
2307		c = *(const unsigned char *)name++;
2308		if (c == '/' || c == '\0')
2309			return ERR_PTR(-EACCES);
2310	}
2311	/*
2312	 * See if the low-level filesystem might want
2313	 * to use its own hash..
2314	 */
2315	if (base->d_flags & DCACHE_OP_HASH) {
2316		int err = base->d_op->d_hash(base, &this);
2317		if (err < 0)
2318			return ERR_PTR(err);
2319	}
2320
2321	err = inode_permission(base->d_inode, MAY_EXEC);
2322	if (err)
2323		return ERR_PTR(err);
2324
2325	return __lookup_hash(&this, base, 0);
2326}
2327EXPORT_SYMBOL(lookup_one_len);
2328
2329int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2330		 struct path *path, int *empty)
2331{
2332	return filename_lookup(dfd, getname_flags(name, flags, empty),
2333			       flags, path, NULL);
2334}
2335EXPORT_SYMBOL(user_path_at_empty);
2336
2337/*
2338 * NB: most callers don't do anything directly with the reference to the
2339 *     to struct filename, but the nd->last pointer points into the name string
2340 *     allocated by getname. So we must hold the reference to it until all
2341 *     path-walking is complete.
2342 */
2343static inline struct filename *
2344user_path_parent(int dfd, const char __user *path,
2345		 struct path *parent,
2346		 struct qstr *last,
2347		 int *type,
2348		 unsigned int flags)
2349{
2350	/* only LOOKUP_REVAL is allowed in extra flags */
2351	return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
2352				 parent, last, type);
2353}
2354
2355/**
2356 * mountpoint_last - look up last component for umount
2357 * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
2358 * @path: pointer to container for result
2359 *
2360 * This is a special lookup_last function just for umount. In this case, we
2361 * need to resolve the path without doing any revalidation.
2362 *
2363 * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
2364 * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
2365 * in almost all cases, this lookup will be served out of the dcache. The only
2366 * cases where it won't are if nd->last refers to a symlink or the path is
2367 * bogus and it doesn't exist.
2368 *
2369 * Returns:
2370 * -error: if there was an error during lookup. This includes -ENOENT if the
2371 *         lookup found a negative dentry. The nd->path reference will also be
2372 *         put in this case.
2373 *
2374 * 0:      if we successfully resolved nd->path and found it to not to be a
2375 *         symlink that needs to be followed. "path" will also be populated.
2376 *         The nd->path reference will also be put.
2377 *
2378 * 1:      if we successfully resolved nd->last and found it to be a symlink
2379 *         that needs to be followed. "path" will be populated with the path
2380 *         to the link, and nd->path will *not* be put.
2381 */
2382static int
2383mountpoint_last(struct nameidata *nd, struct path *path)
2384{
2385	int error = 0;
2386	struct dentry *dentry;
2387	struct dentry *dir = nd->path.dentry;
2388
2389	/* If we're in rcuwalk, drop out of it to handle last component */
2390	if (nd->flags & LOOKUP_RCU) {
2391		if (unlazy_walk(nd, NULL, 0))
2392			return -ECHILD;
2393	}
2394
2395	nd->flags &= ~LOOKUP_PARENT;
2396
2397	if (unlikely(nd->last_type != LAST_NORM)) {
2398		error = handle_dots(nd, nd->last_type);
2399		if (error)
2400			return error;
2401		dentry = dget(nd->path.dentry);
2402		goto done;
2403	}
2404
2405	mutex_lock(&dir->d_inode->i_mutex);
2406	dentry = d_lookup(dir, &nd->last);
2407	if (!dentry) {
2408		/*
2409		 * No cached dentry. Mounted dentries are pinned in the cache,
2410		 * so that means that this dentry is probably a symlink or the
2411		 * path doesn't actually point to a mounted dentry.
2412		 */
2413		dentry = d_alloc(dir, &nd->last);
2414		if (!dentry) {
2415			mutex_unlock(&dir->d_inode->i_mutex);
2416			return -ENOMEM;
2417		}
2418		dentry = lookup_real(dir->d_inode, dentry, nd->flags);
2419		if (IS_ERR(dentry)) {
2420			mutex_unlock(&dir->d_inode->i_mutex);
2421			return PTR_ERR(dentry);
2422		}
2423	}
2424	mutex_unlock(&dir->d_inode->i_mutex);
2425
2426done:
2427	if (d_is_negative(dentry)) {
2428		dput(dentry);
2429		return -ENOENT;
2430	}
2431	if (nd->depth)
2432		put_link(nd);
2433	path->dentry = dentry;
2434	path->mnt = nd->path.mnt;
2435	error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
2436				   d_backing_inode(dentry), 0);
2437	if (unlikely(error))
2438		return error;
2439	mntget(path->mnt);
2440	follow_mount(path);
2441	return 0;
2442}
2443
/**
 * path_mountpoint - look up a path to be umounted
 * @nd:		lookup context
 * @flags:	lookup flags
 * @path:	pointer to container for result
 *
 * Look up the given name, but don't attempt to revalidate the last component.
 * As long as mountpoint_last() returns a positive value the walk is continued
 * from whatever trailing_symlink() hands back.
 *
 * Returns 0 and "path" will be valid on success; Returns error otherwise.
 */
static int
path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
{
	const char *s = path_init(nd, flags);
	int err;

	if (IS_ERR(s))
		return PTR_ERR(s);

	for (;;) {
		err = link_path_walk(s, nd);
		if (err)
			break;

		err = mountpoint_last(nd, path);
		if (err <= 0)
			break;

		/* last component was a symlink to follow: go around again */
		s = trailing_symlink(nd);
		if (IS_ERR(s)) {
			err = PTR_ERR(s);
			break;
		}
	}

	terminate_walk(nd);
	return err;
}
2471
2472static int
2473filename_mountpoint(int dfd, struct filename *name, struct path *path,
2474			unsigned int flags)
2475{
2476	struct nameidata nd;
2477	int error;
2478	if (IS_ERR(name))
2479		return PTR_ERR(name);
2480	set_nameidata(&nd, dfd, name);
2481	error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
2482	if (unlikely(error == -ECHILD))
2483		error = path_mountpoint(&nd, flags, path);
2484	if (unlikely(error == -ESTALE))
2485		error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
2486	if (likely(!error))
2487		audit_inode(name, path->dentry, 0);
2488	restore_nameidata();
2489	putname(name);
2490	return error;
2491}
2492
2493/**
2494 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2495 * @dfd:	directory file descriptor
2496 * @name:	pathname from userland
2497 * @flags:	lookup flags
2498 * @path:	pointer to container to hold result
2499 *
2500 * A umount is a special case for path walking. We're not actually interested
2501 * in the inode in this situation, and ESTALE errors can be a problem. We
2502 * simply want track down the dentry and vfsmount attached at the mountpoint
2503 * and avoid revalidating the last component.
2504 *
2505 * Returns 0 and populates "path" on success.
2506 */
2507int
2508user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2509			struct path *path)
2510{
2511	return filename_mountpoint(dfd, getname(name), path, flags);
2512}
2513
/*
 * Kernel-space counterpart of user_path_mountpoint_at(): @name is a kernel
 * string and is converted with getname_kernel() before being handed to
 * filename_mountpoint().
 */
int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
			unsigned int flags)
{
	struct filename *fname = getname_kernel(name);

	return filename_mountpoint(dfd, fname, path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);
2521
2522int __check_sticky(struct inode *dir, struct inode *inode)
2523{
2524	kuid_t fsuid = current_fsuid();
2525
2526	if (uid_eq(inode->i_uid, fsuid))
2527		return 0;
2528	if (uid_eq(dir->i_uid, fsuid))
2529		return 0;
2530	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
2531}
2532EXPORT_SYMBOL(__check_sticky);
2533
2534/*
2535 *	Check whether we can remove a link victim from directory dir, check
2536 *  whether the type of victim is right.
2537 *  1. We can't do it if dir is read-only (done in permission())
2538 *  2. We should have write and exec permissions on dir
2539 *  3. We can't remove anything from append-only dir
2540 *  4. We can't do anything with immutable dir (done in permission())
2541 *  5. If the sticky bit on dir is set we should either
2542 *	a. be owner of dir, or
2543 *	b. be owner of victim, or
2544 *	c. have CAP_FOWNER capability
2545 *  6. If the victim is append-only or immutable we can't do antyhing with
2546 *     links pointing to it.
2547 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2548 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2549 *  9. We can't remove a root or mountpoint.
2550 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
2551 *     nfs_async_unlink().
2552 */
2553static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
2554{
2555	struct inode *inode = d_backing_inode(victim);
2556	int error;
2557
2558	if (d_is_negative(victim))
2559		return -ENOENT;
2560	BUG_ON(!inode);
2561
2562	BUG_ON(victim->d_parent->d_inode != dir);
2563	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
2564
2565	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
2566	if (error)
2567		return error;
2568	if (IS_APPEND(dir))
2569		return -EPERM;
2570
2571	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
2572	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
2573		return -EPERM;
2574	if (isdir) {
2575		if (!d_is_dir(victim))
2576			return -ENOTDIR;
2577		if (IS_ROOT(victim))
2578			return -EBUSY;
2579	} else if (d_is_dir(victim))
2580		return -EISDIR;
2581	if (IS_DEADDIR(dir))
2582		return -ENOENT;
2583	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2584		return -EBUSY;
2585	return 0;
2586}
2587
2588/*	Check whether we can create an object with dentry child in directory
2589 *  dir.
2590 *  1. We can't do it if child already exists (open has special treatment for
2591 *     this case, but since we are inlined it's OK)
2592 *  2. We can't do it if dir is read-only (done in permission())
2593 *  3. We should have write and exec permissions on dir
2594 *  4. We can't do it if dir is immutable (done in permission())
2595 */
2596static inline int may_create(struct inode *dir, struct dentry *child)
2597{
2598	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
2599	if (child->d_inode)
2600		return -EEXIST;
2601	if (IS_DEADDIR(dir))
2602		return -ENOENT;
2603	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
2604}
2605
2606/*
2607 * p1 and p2 should be directories on the same fs.
2608 */
2609struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
2610{
2611	struct dentry *p;
2612
2613	if (p1 == p2) {
2614		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2615		return NULL;
2616	}
2617
2618	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
2619
2620	p = d_ancestor(p2, p1);
2621	if (p) {
2622		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
2623		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
2624		return p;
2625	}
2626
2627	p = d_ancestor(p1, p2);
2628	if (p) {
2629		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2630		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
2631		return p;
2632	}
2633
2634	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2635	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
2636	return NULL;
2637}
2638EXPORT_SYMBOL(lock_rename);
2639
2640void unlock_rename(struct dentry *p1, struct dentry *p2)
2641{
2642	mutex_unlock(&p1->d_inode->i_mutex);
2643	if (p1 != p2) {
2644		mutex_unlock(&p2->d_inode->i_mutex);
2645		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
2646	}
2647}
2648EXPORT_SYMBOL(unlock_rename);
2649
2650int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2651		bool want_excl)
2652{
2653	int error = may_create(dir, dentry);
2654	if (error)
2655		return error;
2656
2657	if (!dir->i_op->create)
2658		return -EACCES;	/* shouldn't it be ENOSYS? */
2659	mode &= S_IALLUGO;
2660	mode |= S_IFREG;
2661	error = security_inode_create(dir, dentry, mode);
2662	if (error)
2663		return error;
2664	error = dir->i_op->create(dir, dentry, mode, want_excl);
2665	if (!error)
2666		fsnotify_create(dir, dentry);
2667	return error;
2668}
2669EXPORT_SYMBOL(vfs_create);
2670
2671static int may_open(struct path *path, int acc_mode, int flag)
2672{
2673	struct dentry *dentry = path->dentry;
2674	struct inode *inode = dentry->d_inode;
2675	int error;
2676
2677	/* O_PATH? */
2678	if (!acc_mode)
2679		return 0;
2680
2681	if (!inode)
2682		return -ENOENT;
2683
2684	switch (inode->i_mode & S_IFMT) {
2685	case S_IFLNK:
2686		return -ELOOP;
2687	case S_IFDIR:
2688		if (acc_mode & MAY_WRITE)
2689			return -EISDIR;
2690		break;
2691	case S_IFBLK:
2692	case S_IFCHR:
2693		if (path->mnt->mnt_flags & MNT_NODEV)
2694			return -EACCES;
2695		/*FALLTHRU*/
2696	case S_IFIFO:
2697	case S_IFSOCK:
2698		flag &= ~O_TRUNC;
2699		break;
2700	}
2701
2702	error = inode_permission(inode, acc_mode);
2703	if (error)
2704		return error;
2705
2706	/*
2707	 * An append-only file must be opened in append mode for writing.
2708	 */
2709	if (IS_APPEND(inode)) {
2710		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2711			return -EPERM;
2712		if (flag & O_TRUNC)
2713			return -EPERM;
2714	}
2715
2716	/* O_NOATIME can only be set by the owner or superuser */
2717	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2718		return -EPERM;
2719
2720	return 0;
2721}
2722
2723static int handle_truncate(struct file *filp)
2724{
2725	struct path *path = &filp->f_path;
2726	struct inode *inode = path->dentry->d_inode;
2727	int error = get_write_access(inode);
2728	if (error)
2729		return error;
2730	/*
2731	 * Refuse to truncate files with mandatory locks held on them.
2732	 */
2733	error = locks_verify_locked(filp);
2734	if (!error)
2735		error = security_path_truncate(path);
2736	if (!error) {
2737		error = do_truncate(path->dentry, 0,
2738				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2739				    filp);
2740	}
2741	put_write_access(inode);
2742	return error;
2743}
2744
/*
 * Convert open flags for use by the lookup code: when the O_ACCMODE bits of
 * @flag have the value 3, return @flag with that value reduced to 2;
 * otherwise return @flag unchanged.  All other bits are left alone.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
2751
2752static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
2753{
2754	int error = security_path_mknod(dir, dentry, mode, 0);
2755	if (error)
2756		return error;
2757
2758	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
2759	if (error)
2760		return error;
2761
2762	return security_inode_create(dir->dentry->d_inode, dentry, mode);
2763}
2764
2765/*
2766 * Attempt to atomically look up, create and open a file from a negative
2767 * dentry.
2768 *
2769 * Returns 0 if successful.  The file will have been created and attached to
2770 * @file by the filesystem calling finish_open().
2771 *
2772 * Returns 1 if the file was looked up only or didn't need creating.  The
2773 * caller will need to perform the open themselves.  @path will have been
2774 * updated to point to the new dentry.  This may be negative.
2775 *
2776 * Returns an error code otherwise.
2777 */
2778static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2779			struct path *path, struct file *file,
2780			const struct open_flags *op,
2781			bool got_write, bool need_lookup,
2782			int *opened)
2783{
2784	struct inode *dir =  nd->path.dentry->d_inode;
2785	unsigned open_flag = open_to_namei_flags(op->open_flag);
2786	umode_t mode;
2787	int error;
2788	int acc_mode;
2789	int create_error = 0;
2790	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
2791	bool excl;
2792
2793	BUG_ON(dentry->d_inode);
2794
2795	/* Don't create child dentry for a dead directory. */
2796	if (unlikely(IS_DEADDIR(dir))) {
2797		error = -ENOENT;
2798		goto out;
2799	}
2800
2801	mode = op->mode;
2802	if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
2803		mode &= ~current_umask();
2804
2805	excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
2806	if (excl)
2807		open_flag &= ~O_TRUNC;
2808
2809	/*
2810	 * Checking write permission is tricky, bacuse we don't know if we are
2811	 * going to actually need it: O_CREAT opens should work as long as the
2812	 * file exists.  But checking existence breaks atomicity.  The trick is
2813	 * to check access and if not granted clear O_CREAT from the flags.
2814	 *
2815	 * Another problem is returing the "right" error value (e.g. for an
2816	 * O_EXCL open we want to return EEXIST not EROFS).
2817	 */
2818	if (((open_flag & (O_CREAT | O_TRUNC)) ||
2819	    (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
2820		if (!(open_flag & O_CREAT)) {
2821			/*
2822			 * No O_CREATE -> atomicity not a requirement -> fall
2823			 * back to lookup + open
2824			 */
2825			goto no_open;
2826		} else if (open_flag & (O_EXCL | O_TRUNC)) {
2827			/* Fall back and fail with the right error */
2828			create_error = -EROFS;
2829			goto no_open;
2830		} else {
2831			/* No side effects, safe to clear O_CREAT */
2832			create_error = -EROFS;
2833			open_flag &= ~O_CREAT;
2834		}
2835	}
2836
2837	if (open_flag & O_CREAT) {
2838		error = may_o_create(&nd->path, dentry, mode);
2839		if (error) {
2840			create_error = error;
2841			if (open_flag & O_EXCL)
2842				goto no_open;
2843			open_flag &= ~O_CREAT;
2844		}
2845	}
2846
2847	if (nd->flags & LOOKUP_DIRECTORY)
2848		open_flag |= O_DIRECTORY;
2849
2850	file->f_path.dentry = DENTRY_NOT_SET;
2851	file->f_path.mnt = nd->path.mnt;
2852	error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
2853				      opened);
2854	if (error < 0) {
2855		if (create_error && error == -ENOENT)
2856			error = create_error;
2857		goto out;
2858	}
2859
2860	if (error) {	/* returned 1, that is */
2861		if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
2862			error = -EIO;
2863			goto out;
2864		}
2865		if (file->f_path.dentry) {
2866			dput(dentry);
2867			dentry = file->f_path.dentry;
2868		}
2869		if (*opened & FILE_CREATED)
2870			fsnotify_create(dir, dentry);
2871		if (!dentry->d_inode) {
2872			WARN_ON(*opened & FILE_CREATED);
2873			if (create_error) {
2874				error = create_error;
2875				goto out;
2876			}
2877		} else {
2878			if (excl && !(*opened & FILE_CREATED)) {
2879				error = -EEXIST;
2880				goto out;
2881			}
2882		}
2883		goto looked_up;
2884	}
2885
2886	/*
2887	 * We didn't have the inode before the open, so check open permission
2888	 * here.
2889	 */
2890	acc_mode = op->acc_mode;
2891	if (*opened & FILE_CREATED) {
2892		WARN_ON(!(open_flag & O_CREAT));
2893		fsnotify_create(dir, dentry);
2894		acc_mode = MAY_OPEN;
2895	}
2896	error = may_open(&file->f_path, acc_mode, open_flag);
2897	if (error)
2898		fput(file);
2899
2900out:
2901	dput(dentry);
2902	return error;
2903
2904no_open:
2905	if (need_lookup) {
2906		dentry = lookup_real(dir, dentry, nd->flags);
2907		if (IS_ERR(dentry))
2908			return PTR_ERR(dentry);
2909	}
2910	if (create_error && !dentry->d_inode) {
2911		error = create_error;
2912		goto out;
2913	}
2914looked_up:
2915	path->dentry = dentry;
2916	path->mnt = nd->path.mnt;
2917	return 1;
2918}
2919
2920/*
2921 * Look up and maybe create and open the last component.
2922 *
2923 * Must be called with i_mutex held on parent.
2924 *
2925 * Returns 0 if the file was successfully atomically created (if necessary) and
2926 * opened.  In this case the file will be returned attached to @file.
2927 *
2928 * Returns 1 if the file was not completely opened at this time, though lookups
2929 * and creations will have been performed and the dentry returned in @path will
2930 * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
2931 * specified then a negative dentry may be returned.
2932 *
2933 * An error code is returned otherwise.
2934 *
2935 * FILE_CREATE will be set in @*opened if the dentry was created and will be
2936 * cleared otherwise prior to returning.
2937 */
2938static int lookup_open(struct nameidata *nd, struct path *path,
2939			struct file *file,
2940			const struct open_flags *op,
2941			bool got_write, int *opened)
2942{
2943	struct dentry *dir = nd->path.dentry;
2944	struct inode *dir_inode = dir->d_inode;
2945	struct dentry *dentry;
2946	int error;
2947	bool need_lookup;
2948
2949	*opened &= ~FILE_CREATED;
2950	dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
2951	if (IS_ERR(dentry))
2952		return PTR_ERR(dentry);
2953
2954	/* Cached positive dentry: will open in f_op->open */
2955	if (!need_lookup && dentry->d_inode)
2956		goto out_no_open;
2957
2958	if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
2959		return atomic_open(nd, dentry, path, file, op, got_write,
2960				   need_lookup, opened);
2961	}
2962
2963	if (need_lookup) {
2964		BUG_ON(dentry->d_inode);
2965
2966		dentry = lookup_real(dir_inode, dentry, nd->flags);
2967		if (IS_ERR(dentry))
2968			return PTR_ERR(dentry);
2969	}
2970
2971	/* Negative dentry, just create the file */
2972	if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
2973		umode_t mode = op->mode;
2974		if (!IS_POSIXACL(dir->d_inode))
2975			mode &= ~current_umask();
2976		/*
2977		 * This write is needed to ensure that a
2978		 * rw->ro transition does not occur between
2979		 * the time when the file is created and when
2980		 * a permanent write count is taken through
2981		 * the 'struct file' in finish_open().
2982		 */
2983		if (!got_write) {
2984			error = -EROFS;
2985			goto out_dput;
2986		}
2987		*opened |= FILE_CREATED;
2988		error = security_path_mknod(&nd->path, dentry, mode, 0);
2989		if (error)
2990			goto out_dput;
2991		error = vfs_create(dir->d_inode, dentry, mode,
2992				   nd->flags & LOOKUP_EXCL);
2993		if (error)
2994			goto out_dput;
2995	}
2996out_no_open:
2997	path->dentry = dentry;
2998	path->mnt = nd->path.mnt;
2999	return 1;
3000
3001out_dput:
3002	dput(dentry);
3003	return error;
3004}
3005
3006/*
3007 * Handle the last step of open()
3008 */
3009static int do_last(struct nameidata *nd,
3010		   struct file *file, const struct open_flags *op,
3011		   int *opened)
3012{
3013	struct dentry *dir = nd->path.dentry;
3014	int open_flag = op->open_flag;
3015	bool will_truncate = (open_flag & O_TRUNC) != 0;
3016	bool got_write = false;
3017	int acc_mode = op->acc_mode;
3018	unsigned seq;
3019	struct inode *inode;
3020	struct path save_parent = { .dentry = NULL, .mnt = NULL };
3021	struct path path;
3022	bool retried = false;
3023	int error;
3024
3025	nd->flags &= ~LOOKUP_PARENT;
3026	nd->flags |= op->intent;
3027
3028	if (nd->last_type != LAST_NORM) {
3029		error = handle_dots(nd, nd->last_type);
3030		if (unlikely(error))
3031			return error;
3032		goto finish_open;
3033	}
3034
3035	if (!(open_flag & O_CREAT)) {
3036		if (nd->last.name[nd->last.len])
3037			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
3038		/* we _can_ be in RCU mode here */
3039		error = lookup_fast(nd, &path, &inode, &seq);
3040		if (likely(!error))
3041			goto finish_lookup;
3042
3043		if (error < 0)
3044			return error;
3045
3046		BUG_ON(nd->inode != dir->d_inode);
3047	} else {
3048		/* create side of things */
3049		/*
3050		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
3051		 * has been cleared when we got to the last component we are
3052		 * about to look up
3053		 */
3054		error = complete_walk(nd);
3055		if (error)
3056			return error;
3057
3058		audit_inode(nd->name, dir, LOOKUP_PARENT);
3059		/* trailing slashes? */
3060		if (unlikely(nd->last.name[nd->last.len]))
3061			return -EISDIR;
3062	}
3063
3064retry_lookup:
3065	if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3066		error = mnt_want_write(nd->path.mnt);
3067		if (!error)
3068			got_write = true;
3069		/*
3070		 * do _not_ fail yet - we might not need that or fail with
3071		 * a different error; let lookup_open() decide; we'll be
3072		 * dropping this one anyway.
3073		 */
3074	}
3075	mutex_lock(&dir->d_inode->i_mutex);
3076	error = lookup_open(nd, &path, file, op, got_write, opened);
3077	mutex_unlock(&dir->d_inode->i_mutex);
3078
3079	if (error <= 0) {
3080		if (error)
3081			goto out;
3082
3083		if ((*opened & FILE_CREATED) ||
3084		    !S_ISREG(file_inode(file)->i_mode))
3085			will_truncate = false;
3086
3087		audit_inode(nd->name, file->f_path.dentry, 0);
3088		goto opened;
3089	}
3090
3091	if (*opened & FILE_CREATED) {
3092		/* Don't check for write permission, don't truncate */
3093		open_flag &= ~O_TRUNC;
3094		will_truncate = false;
3095		acc_mode = MAY_OPEN;
3096		path_to_nameidata(&path, nd);
3097		goto finish_open_created;
3098	}
3099
3100	/*
3101	 * create/update audit record if it already exists.
3102	 */
3103	if (d_is_positive(path.dentry))
3104		audit_inode(nd->name, path.dentry, 0);
3105
3106	/*
3107	 * If atomic_open() acquired write access it is dropped now due to
3108	 * possible mount and symlink following (this might be optimized away if
3109	 * necessary...)
3110	 */
3111	if (got_write) {
3112		mnt_drop_write(nd->path.mnt);
3113		got_write = false;
3114	}
3115
3116	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
3117		path_to_nameidata(&path, nd);
3118		return -EEXIST;
3119	}
3120
3121	error = follow_managed(&path, nd);
3122	if (unlikely(error < 0))
3123		return error;
3124
3125	BUG_ON(nd->flags & LOOKUP_RCU);
3126	seq = 0;	/* out of RCU mode, so the value doesn't matter */
3127	if (unlikely(d_is_negative(path.dentry))) {
3128		path_to_nameidata(&path, nd);
3129		return -ENOENT;
3130	}
3131	inode = d_backing_inode(path.dentry);
3132finish_lookup:
3133	if (nd->depth)
3134		put_link(nd);
3135	error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
3136				   inode, seq);
3137	if (unlikely(error))
3138		return error;
3139
3140	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
3141		path_to_nameidata(&path, nd);
3142	} else {
3143		save_parent.dentry = nd->path.dentry;
3144		save_parent.mnt = mntget(path.mnt);
3145		nd->path.dentry = path.dentry;
3146
3147	}
3148	nd->inode = inode;
3149	nd->seq = seq;
3150	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
3151finish_open:
3152	error = complete_walk(nd);
3153	if (error) {
3154		path_put(&save_parent);
3155		return error;
3156	}
3157	audit_inode(nd->name, nd->path.dentry, 0);
3158	if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
3159		error = -ELOOP;
3160		goto out;
3161	}
3162	error = -EISDIR;
3163	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
3164		goto out;
3165	error = -ENOTDIR;
3166	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3167		goto out;
3168	if (!d_is_reg(nd->path.dentry))
3169		will_truncate = false;
3170
3171	if (will_truncate) {
3172		error = mnt_want_write(nd->path.mnt);
3173		if (error)
3174			goto out;
3175		got_write = true;
3176	}
3177finish_open_created:
3178	error = may_open(&nd->path, acc_mode, open_flag);
3179	if (error)
3180		goto out;
3181
3182	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
3183	error = vfs_open(&nd->path, file, current_cred());
3184	if (!error) {
3185		*opened |= FILE_OPENED;
3186	} else {
3187		if (error == -EOPENSTALE)
3188			goto stale_open;
3189		goto out;
3190	}
3191opened:
3192	error = open_check_o_direct(file);
3193	if (error)
3194		goto exit_fput;
3195	error = ima_file_check(file, op->acc_mode, *opened);
3196	if (error)
3197		goto exit_fput;
3198
3199	if (will_truncate) {
3200		error = handle_truncate(file);
3201		if (error)
3202			goto exit_fput;
3203	}
3204out:
3205	if (unlikely(error > 0)) {
3206		WARN_ON(1);
3207		error = -EINVAL;
3208	}
3209	if (got_write)
3210		mnt_drop_write(nd->path.mnt);
3211	path_put(&save_parent);
3212	return error;
3213
3214exit_fput:
3215	fput(file);
3216	goto out;
3217
3218stale_open:
3219	/* If no saved parent or already retried then can't retry */
3220	if (!save_parent.dentry || retried)
3221		goto out;
3222
3223	BUG_ON(save_parent.dentry != dir);
3224	path_put(&nd->path);
3225	nd->path = save_parent;
3226	nd->inode = dir->d_inode;
3227	save_parent.mnt = NULL;
3228	save_parent.dentry = NULL;
3229	if (got_write) {
3230		mnt_drop_write(nd->path.mnt);
3231		got_write = false;
3232	}
3233	retried = true;
3234	goto retry_lookup;
3235}
3236
/*
 * do_tmpfile - __O_TMPFILE back end of path_openat().
 * @nd:     lookup state; its pathname is resolved with LOOKUP_DIRECTORY
 * @flags:  lookup flags for this attempt
 * @op:     open parameters (->mode and ->open_flag are used here)
 * @file:   pre-allocated file that gets opened on the new inode
 * @opened: passed through to finish_open()
 *
 * Returns 0 on success or a negative errno.  If open_check_o_direct()
 * fails after finish_open() has succeeded, @file is dropped with fput()
 * before returning.
 */
static int do_tmpfile(struct nameidata *nd, unsigned flags,
		const struct open_flags *op,
		struct file *file, int *opened)
{
	/* Name given to the child dentry that is handed to ->tmpfile(). */
	static const struct qstr name = QSTR_INIT("/", 1);
	struct dentry *child;
	struct inode *dir;
	struct path path;
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
	if (unlikely(error))
		return error;
	/* Write access to the mount is held until the out2 label. */
	error = mnt_want_write(path.mnt);
	if (unlikely(error))
		goto out;
	dir = path.dentry->d_inode;
	/* we want directory to be writable */
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
	if (error)
		goto out2;
	/* The directory's filesystem must implement ->tmpfile(). */
	if (!dir->i_op->tmpfile) {
		error = -EOPNOTSUPP;
		goto out2;
	}
	child = d_alloc(path.dentry, &name);
	if (unlikely(!child)) {
		error = -ENOMEM;
		goto out2;
	}
	/*
	 * From here on 'path' refers to the child: the directory dentry
	 * reference is dropped and the final path_put() releases the child.
	 */
	dput(path.dentry);
	path.dentry = child;
	error = dir->i_op->tmpfile(dir, child, op->mode);
	if (error)
		goto out2;
	audit_inode(nd->name, child, 0);
	/* Don't check for other permissions, the inode was just created */
	error = may_open(&path, MAY_OPEN, op->open_flag);
	if (error)
		goto out2;
	file->f_path.mnt = path.mnt;
	error = finish_open(file, child, NULL, opened);
	if (error)
		goto out2;
	error = open_check_o_direct(file);
	if (error) {
		fput(file);
	} else if (!(op->open_flag & O_EXCL)) {
		/*
		 * Without O_EXCL the inode is flagged I_LINKABLE; vfs_link()
		 * only accepts an inode with a zero link count when that
		 * flag is set.
		 */
		struct inode *inode = file_inode(file);
		spin_lock(&inode->i_lock);
		inode->i_state |= I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
out2:
	mnt_drop_write(path.mnt);
out:
	path_put(&path);
	return error;
}
3294
/*
 * path_openat - one attempt at opening the pathname held in @nd.
 * @nd:    lookup state set up by the caller
 * @op:    open parameters
 * @flags: lookup flags for this attempt
 *
 * Returns the opened file or an ERR_PTR().  -EOPENSTALE is not passed on
 * to the caller: it is reported as -ECHILD when @flags contains LOOKUP_RCU
 * and as -ESTALE otherwise.
 */
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
{
	const char *s;
	struct file *file;
	int opened = 0;
	int error;

	file = get_empty_filp();
	if (IS_ERR(file))
		return file;

	file->f_flags = op->open_flag;

	/* __O_TMPFILE opens bypass the regular walk entirely. */
	if (unlikely(file->f_flags & __O_TMPFILE)) {
		error = do_tmpfile(nd, flags, op, file, &opened);
		goto out2;
	}

	s = path_init(nd, flags);
	if (IS_ERR(s)) {
		put_filp(file);
		return ERR_CAST(s);
	}
	/*
	 * Walk the path, then let do_last() handle the final component.
	 * A positive return from do_last() keeps the loop going: the
	 * open/create lookup flags are cleared and trailing_symlink()
	 * supplies the next string to walk.
	 */
	while (!(error = link_path_walk(s, nd)) &&
		(error = do_last(nd, file, op, &opened)) > 0) {
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
		s = trailing_symlink(nd);
		if (IS_ERR(s)) {
			error = PTR_ERR(s);
			break;
		}
	}
	terminate_walk(nd);
out2:
	/*
	 * If FILE_OPENED was never set, an error must have been recorded
	 * (BUG otherwise) and the file is released with put_filp().
	 */
	if (!(opened & FILE_OPENED)) {
		BUG_ON(!error);
		put_filp(file);
	}
	if (unlikely(error)) {
		if (error == -EOPENSTALE) {
			if (flags & LOOKUP_RCU)
				error = -ECHILD;
			else
				error = -ESTALE;
		}
		file = ERR_PTR(error);
	}
	return file;
}
3345
3346struct file *do_filp_open(int dfd, struct filename *pathname,
3347		const struct open_flags *op)
3348{
3349	struct nameidata nd;
3350	int flags = op->lookup_flags;
3351	struct file *filp;
3352
3353	set_nameidata(&nd, dfd, pathname);
3354	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3355	if (unlikely(filp == ERR_PTR(-ECHILD)))
3356		filp = path_openat(&nd, op, flags);
3357	if (unlikely(filp == ERR_PTR(-ESTALE)))
3358		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3359	restore_nameidata();
3360	return filp;
3361}
3362
3363struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3364		const char *name, const struct open_flags *op)
3365{
3366	struct nameidata nd;
3367	struct file *file;
3368	struct filename *filename;
3369	int flags = op->lookup_flags | LOOKUP_ROOT;
3370
3371	nd.root.mnt = mnt;
3372	nd.root.dentry = dentry;
3373
3374	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
3375		return ERR_PTR(-ELOOP);
3376
3377	filename = getname_kernel(name);
3378	if (IS_ERR(filename))
3379		return ERR_CAST(filename);
3380
3381	set_nameidata(&nd, -1, filename);
3382	file = path_openat(&nd, op, flags | LOOKUP_RCU);
3383	if (unlikely(file == ERR_PTR(-ECHILD)))
3384		file = path_openat(&nd, op, flags);
3385	if (unlikely(file == ERR_PTR(-ESTALE)))
3386		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3387	restore_nameidata();
3388	putname(filename);
3389	return file;
3390}
3391
/*
 * filename_create - look up the parent of @name and a negative dentry for
 * its last component, ready for a create-type operation.
 * @dfd:          directory fd the lookup is relative to
 * @name:         pathname; released with putname() on every path below
 *                once filename_parentat() has succeeded
 * @path:         filled in with the parent directory
 * @lookup_flags: only LOOKUP_REVAL and LOOKUP_DIRECTORY are looked at
 *
 * On success returns the (negative) dentry with the parent's i_mutex held,
 * write access to the mount taken and @path still referenced;
 * done_path_create() undoes all of that.  On failure returns an ERR_PTR()
 * with everything acquired here released again.
 */
static struct dentry *filename_create(int dfd, struct filename *name,
				struct path *path, unsigned int lookup_flags)
{
	/* Default result for the "bad last component" exit below. */
	struct dentry *dentry = ERR_PTR(-EEXIST);
	struct qstr last;
	int type;
	int err2;
	int error;
	/* Remember LOOKUP_DIRECTORY before the flags are masked. */
	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

	/*
	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
	 * other flags passed in are ignored!
	 */
	lookup_flags &= LOOKUP_REVAL;

	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
	if (IS_ERR(name))
		return ERR_CAST(name);

	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
	if (unlikely(type != LAST_NORM))
		goto out;

	/* don't fail immediately if it's r/o, at least try to report other errors */
	err2 = mnt_want_write(path->mnt);
	/*
	 * Do the final lookup.
	 */
	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
	mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
	if (IS_ERR(dentry))
		goto unlock;

	error = -EEXIST;
	if (d_is_positive(dentry))
		goto fail;

	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
	if (unlikely(!is_dir && last.name[last.len])) {
		error = -ENOENT;
		goto fail;
	}
	/* Only now report a failed mnt_want_write(), after the other checks. */
	if (unlikely(err2)) {
		error = err2;
		goto fail;
	}
	putname(name);
	return dentry;
fail:
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	/* Write access is dropped only if it was actually obtained. */
	if (!err2)
		mnt_drop_write(path->mnt);
out:
	path_put(path);
	putname(name);
	return dentry;
}
3462
/*
 * Kernel-pathname front end to filename_create(): @pathname is wrapped
 * with getname_kernel() and the result handed on unchanged.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *kname = getname_kernel(pathname);

	return filename_create(dfd, kname, path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);
3470
3471void done_path_create(struct path *path, struct dentry *dentry)
3472{
3473	dput(dentry);
3474	mutex_unlock(&path->dentry->d_inode->i_mutex);
3475	mnt_drop_write(path->mnt);
3476	path_put(path);
3477}
3478EXPORT_SYMBOL(done_path_create);
3479
3480inline struct dentry *user_path_create(int dfd, const char __user *pathname,
3481				struct path *path, unsigned int lookup_flags)
3482{
3483	return filename_create(dfd, getname(pathname), path, lookup_flags);
3484}
3485EXPORT_SYMBOL(user_path_create);
3486
3487int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
3488{
3489	int error = may_create(dir, dentry);
3490
3491	if (error)
3492		return error;
3493
3494	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
3495		return -EPERM;
3496
3497	if (!dir->i_op->mknod)
3498		return -EPERM;
3499
3500	error = devcgroup_inode_mknod(mode, dev);
3501	if (error)
3502		return error;
3503
3504	error = security_inode_mknod(dir, dentry, mode, dev);
3505	if (error)
3506		return error;
3507
3508	error = dir->i_op->mknod(dir, dentry, mode, dev);
3509	if (!error)
3510		fsnotify_create(dir, dentry);
3511	return error;
3512}
3513EXPORT_SYMBOL(vfs_mknod);
3514
3515static int may_mknod(umode_t mode)
3516{
3517	switch (mode & S_IFMT) {
3518	case S_IFREG:
3519	case S_IFCHR:
3520	case S_IFBLK:
3521	case S_IFIFO:
3522	case S_IFSOCK:
3523	case 0: /* zero mode translates to S_IFREG */
3524		return 0;
3525	case S_IFDIR:
3526		return -EPERM;
3527	default:
3528		return -EINVAL;
3529	}
3530}
3531
3532SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3533		unsigned, dev)
3534{
3535	struct dentry *dentry;
3536	struct path path;
3537	int error;
3538	unsigned int lookup_flags = 0;
3539
3540	error = may_mknod(mode);
3541	if (error)
3542		return error;
3543retry:
3544	dentry = user_path_create(dfd, filename, &path, lookup_flags);
3545	if (IS_ERR(dentry))
3546		return PTR_ERR(dentry);
3547
3548	if (!IS_POSIXACL(path.dentry->d_inode))
3549		mode &= ~current_umask();
3550	error = security_path_mknod(&path, dentry, mode, dev);
3551	if (error)
3552		goto out;
3553	switch (mode & S_IFMT) {
3554		case 0: case S_IFREG:
3555			error = vfs_create(path.dentry->d_inode,dentry,mode,true);
3556			break;
3557		case S_IFCHR: case S_IFBLK:
3558			error = vfs_mknod(path.dentry->d_inode,dentry,mode,
3559					new_decode_dev(dev));
3560			break;
3561		case S_IFIFO: case S_IFSOCK:
3562			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
3563			break;
3564	}
3565out:
3566	done_path_create(&path, dentry);
3567	if (retry_estale(error, lookup_flags)) {
3568		lookup_flags |= LOOKUP_REVAL;
3569		goto retry;
3570	}
3571	return error;
3572}
3573
3574SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
3575{
3576	return sys_mknodat(AT_FDCWD, filename, mode, dev);
3577}
3578
3579int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3580{
3581	int error = may_create(dir, dentry);
3582	unsigned max_links = dir->i_sb->s_max_links;
3583
3584	if (error)
3585		return error;
3586
3587	if (!dir->i_op->mkdir)
3588		return -EPERM;
3589
3590	mode &= (S_IRWXUGO|S_ISVTX);
3591	error = security_inode_mkdir(dir, dentry, mode);
3592	if (error)
3593		return error;
3594
3595	if (max_links && dir->i_nlink >= max_links)
3596		return -EMLINK;
3597
3598	error = dir->i_op->mkdir(dir, dentry, mode);
3599	if (!error)
3600		fsnotify_mkdir(dir, dentry);
3601	return error;
3602}
3603EXPORT_SYMBOL(vfs_mkdir);
3604
3605SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3606{
3607	struct dentry *dentry;
3608	struct path path;
3609	int error;
3610	unsigned int lookup_flags = LOOKUP_DIRECTORY;
3611
3612retry:
3613	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
3614	if (IS_ERR(dentry))
3615		return PTR_ERR(dentry);
3616
3617	if (!IS_POSIXACL(path.dentry->d_inode))
3618		mode &= ~current_umask();
3619	error = security_path_mkdir(&path, dentry, mode);
3620	if (!error)
3621		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
3622	done_path_create(&path, dentry);
3623	if (retry_estale(error, lookup_flags)) {
3624		lookup_flags |= LOOKUP_REVAL;
3625		goto retry;
3626	}
3627	return error;
3628}
3629
3630SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
3631{
3632	return sys_mkdirat(AT_FDCWD, pathname, mode);
3633}
3634
3635/*
3636 * The dentry_unhash() helper will try to drop the dentry early: we
3637 * should have a usage count of 1 if we're the only user of this
3638 * dentry, and if that is true (possibly after pruning the dcache),
3639 * then we drop the dentry now.
3640 *
3641 * A low-level filesystem can, if it choses, legally
3642 * do a
3643 *
3644 *	if (!d_unhashed(dentry))
3645 *		return -EBUSY;
3646 *
3647 * if it cannot handle the case of removing a directory
3648 * that is still in use by something else..
3649 */
3650void dentry_unhash(struct dentry *dentry)
3651{
3652	shrink_dcache_parent(dentry);
3653	spin_lock(&dentry->d_lock);
3654	if (dentry->d_lockref.count == 1)
3655		__d_drop(dentry);
3656	spin_unlock(&dentry->d_lock);
3657}
3658EXPORT_SYMBOL(dentry_unhash);
3659
/*
 * vfs_rmdir - remove a directory via the parent's ->rmdir().
 * @dir:    parent directory inode
 * @dentry: victim
 *
 * Returns 0 on success or a negative errno: -EPERM when ->rmdir() is
 * missing, -EBUSY when the victim is a local mountpoint, otherwise the
 * error from may_delete(), the security hook or ->rmdir() itself.
 */
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 1);

	if (error)
		return error;

	if (!dir->i_op->rmdir)
		return -EPERM;

	/* Extra reference and the victim's i_mutex, both released at 'out'. */
	dget(dentry);
	mutex_lock(&dentry->d_inode->i_mutex);

	error = -EBUSY;
	if (is_local_mountpoint(dentry))
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

	shrink_dcache_parent(dentry);
	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

	/* Removal succeeded: flag the inode S_DEAD and detach any mounts. */
	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);
	detach_mounts(dentry);

out:
	mutex_unlock(&dentry->d_inode->i_mutex);
	dput(dentry);
	if (!error)
		d_delete(dentry);
	return error;
}
EXPORT_SYMBOL(vfs_rmdir);
3698
/*
 * do_rmdir - common implementation of rmdir(2) and unlinkat(AT_REMOVEDIR).
 * @dfd:      directory fd the lookup is relative to
 * @pathname: user-space pathname of the directory to remove
 *
 * Returns 0 or a negative errno.  A last component of "..", "." or the
 * root is rejected up front with -ENOTEMPTY, -EINVAL and -EBUSY
 * respectively.  The whole operation is repeated once with LOOKUP_REVAL
 * when retry_estale() asks for it.
 */
static long do_rmdir(int dfd, const char __user *pathname)
{
	int error = 0;
	struct filename *name;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int type;
	unsigned int lookup_flags = 0;
retry:
	name = user_path_parent(dfd, pathname,
				&path, &last, &type, lookup_flags);
	if (IS_ERR(name))
		return PTR_ERR(name);

	switch (type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
	}

	error = mnt_want_write(path.mnt);
	if (error)
		goto exit1;

	/* Look up the victim with the parent's i_mutex held. */
	mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto exit2;
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
	error = security_path_rmdir(&path, dentry);
	if (error)
		goto exit3;
	error = vfs_rmdir(path.dentry->d_inode, dentry);
	/* The exit labels unwind in reverse order of acquisition. */
exit3:
	dput(dentry);
exit2:
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	mnt_drop_write(path.mnt);
exit1:
	path_put(&path);
	putname(name);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}
3757
3758SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
3759{
3760	return do_rmdir(AT_FDCWD, pathname);
3761}
3762
/**
 * vfs_unlink - unlink a filesystem object
 * @dir:	parent directory
 * @dentry:	victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * Returns 0 on success or a negative errno: -EPERM when the directory has
 * no ->unlink() method, -EBUSY when the victim is a local mountpoint,
 * otherwise the error from may_delete(), the security hook,
 * try_break_deleg() or ->unlink() itself.
 */
int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
{
	struct inode *target = dentry->d_inode;
	int error = may_delete(dir, dentry, 0);

	if (error)
		return error;

	if (!dir->i_op->unlink)
		return -EPERM;

	/* The victim's own i_mutex is held across the checks and ->unlink(). */
	mutex_lock(&target->i_mutex);
	if (is_local_mountpoint(dentry))
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
		if (!error) {
			error = try_break_deleg(target, delegated_inode);
			if (error)
				goto out;
			error = dir->i_op->unlink(dir, dentry);
			if (!error) {
				dont_mount(dentry);
				detach_mounts(dentry);
			}
		}
	}
out:
	mutex_unlock(&target->i_mutex);

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
		fsnotify_link_count(target);
		d_delete(dentry);
	}

	return error;
}
EXPORT_SYMBOL(vfs_unlink);
3820
/*
 * do_unlinkat - common implementation of unlink(2) and unlinkat(2)
 * without AT_REMOVEDIR.
 *
 * Make sure that the actual truncation of the file will occur outside its
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 *
 * To that end an extra inode reference is taken with ihold() before
 * vfs_unlink() and only dropped with iput() after the parent's i_mutex
 * has been released.
 *
 * Returns 0 or a negative errno.  A last component that is not LAST_NORM
 * yields -EISDIR.  The lookup is redone after a successful delegation
 * break, and the whole operation is repeated once with LOOKUP_REVAL when
 * retry_estale() asks for it.
 */
static long do_unlinkat(int dfd, const char __user *pathname)
{
	int error;
	struct filename *name;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int type;
	struct inode *inode = NULL;
	struct inode *delegated_inode = NULL;
	unsigned int lookup_flags = 0;
retry:
	name = user_path_parent(dfd, pathname,
				&path, &last, &type, lookup_flags);
	if (IS_ERR(name))
		return PTR_ERR(name);

	error = -EISDIR;
	if (type != LAST_NORM)
		goto exit1;

	error = mnt_want_write(path.mnt);
	if (error)
		goto exit1;
retry_deleg:
	mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
		if (last.name[last.len])
			goto slashes;
		inode = dentry->d_inode;
		if (d_is_negative(dentry))
			goto slashes;
		ihold(inode);
		error = security_path_unlink(&path, dentry);
		if (error)
			goto exit2;
		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
		/*
		 * exit2 sits inside this block so that the 'slashes' code
		 * below can jump back here to drop the dentry.
		 */
exit2:
		dput(dentry);
	}
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	if (inode)
		iput(inode);	/* truncate the inode here */
	inode = NULL;
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(path.mnt);
exit1:
	path_put(&path);
	putname(name);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
	return error;

	/* Pick the error for a trailing-slash name or a negative dentry. */
slashes:
	if (d_is_negative(dentry))
		error = -ENOENT;
	else if (d_is_dir(dentry))
		error = -EISDIR;
	else
		error = -ENOTDIR;
	goto exit2;
}
3899
3900SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
3901{
3902	if ((flag & ~AT_REMOVEDIR) != 0)
3903		return -EINVAL;
3904
3905	if (flag & AT_REMOVEDIR)
3906		return do_rmdir(dfd, pathname);
3907
3908	return do_unlinkat(dfd, pathname);
3909}
3910
3911SYSCALL_DEFINE1(unlink, const char __user *, pathname)
3912{
3913	return do_unlinkat(AT_FDCWD, pathname);
3914}
3915
3916int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
3917{
3918	int error = may_create(dir, dentry);
3919
3920	if (error)
3921		return error;
3922
3923	if (!dir->i_op->symlink)
3924		return -EPERM;
3925
3926	error = security_inode_symlink(dir, dentry, oldname);
3927	if (error)
3928		return error;
3929
3930	error = dir->i_op->symlink(dir, dentry, oldname);
3931	if (!error)
3932		fsnotify_create(dir, dentry);
3933	return error;
3934}
3935EXPORT_SYMBOL(vfs_symlink);
3936
3937SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3938		int, newdfd, const char __user *, newname)
3939{
3940	int error;
3941	struct filename *from;
3942	struct dentry *dentry;
3943	struct path path;
3944	unsigned int lookup_flags = 0;
3945
3946	from = getname(oldname);
3947	if (IS_ERR(from))
3948		return PTR_ERR(from);
3949retry:
3950	dentry = user_path_create(newdfd, newname, &path, lookup_flags);
3951	error = PTR_ERR(dentry);
3952	if (IS_ERR(dentry))
3953		goto out_putname;
3954
3955	error = security_path_symlink(&path, dentry, from->name);
3956	if (!error)
3957		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
3958	done_path_create(&path, dentry);
3959	if (retry_estale(error, lookup_flags)) {
3960		lookup_flags |= LOOKUP_REVAL;
3961		goto retry;
3962	}
3963out_putname:
3964	putname(from);
3965	return error;
3966}
3967
3968SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
3969{
3970	return sys_symlinkat(oldname, AT_FDCWD, newname);
3971}
3972
/**
 * vfs_link - create a new link
 * @old_dentry:	object to be linked
 * @dir:	new parent
 * @new_dentry:	where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * Returns 0 on success or a negative errno: -ENOENT for a negative
 * @old_dentry or an unlinked inode without I_LINKABLE, -EXDEV across
 * superblocks, -EPERM for append-only/immutable inodes, directories or a
 * missing ->link() method, and -EMLINK when the link limit is reached.
 */
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
{
	struct inode *inode = old_dentry->d_inode;
	unsigned max_links = dir->i_sb->s_max_links;
	int error;

	if (!inode)
		return -ENOENT;

	error = may_create(dir, new_dentry);
	if (error)
		return error;

	/* Source and destination must live on the same superblock. */
	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
	if (!dir->i_op->link)
		return -EPERM;
	if (S_ISDIR(inode->i_mode))
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

	mutex_lock(&inode->i_mutex);
	/* Make sure we don't allow creating hardlink to an unlinked file */
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
		error =  -ENOENT;
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
	else {
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}

	/* Once a link has been created, I_LINKABLE is cleared again. */
	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
	mutex_unlock(&inode->i_mutex);
	if (!error)
		fsnotify_link(dir, inode, new_dentry);
	return error;
}
EXPORT_SYMBOL(vfs_link);
4045
/*
 * linkat(2): create a hard link @newname (relative to @newdfd) to the
 * object named by @oldname (relative to @olddfd).
 *
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 *
 * Symlinks on the oldname are followed only when AT_SYMLINK_FOLLOW is
 * given.  Flags other than AT_SYMLINK_FOLLOW and AT_EMPTY_PATH give
 * -EINVAL; old and new names on different mounts give -EXDEV.
 */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
{
	struct dentry *new_dentry;
	struct path old_path, new_path;
	struct inode *delegated_inode = NULL;
	int how = 0;
	int error;

	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
		return -EINVAL;
	/*
	 * To use null names we require CAP_DAC_READ_SEARCH
	 * This ensures that not everyone will be able to create
	 * hardlink using the passed filedescriptor.
	 */
	if (flags & AT_EMPTY_PATH) {
		if (!capable(CAP_DAC_READ_SEARCH))
			return -ENOENT;
		how = LOOKUP_EMPTY;
	}

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
retry:
	error = user_path_at(olddfd, oldname, how, &old_path);
	if (error)
		return error;

	/* Of the 'how' bits only LOOKUP_REVAL is passed to the create lookup. */
	new_dentry = user_path_create(newdfd, newname, &new_path,
					(how & LOOKUP_REVAL));
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto out;

	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
	error = may_linkat(&old_path);
	if (unlikely(error))
		goto out_dput;
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
	if (error)
		goto out_dput;
	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
out_dput:
	done_path_create(&new_path, new_dentry);
	/*
	 * A delegation was found: wait for it to be broken (the create
	 * state has already been released above), then start over with
	 * fresh lookups of both names.
	 */
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
		if (!error) {
			path_put(&old_path);
			goto retry;
		}
	}
	if (retry_estale(error, how)) {
		path_put(&old_path);
		how |= LOOKUP_REVAL;
		goto retry;
	}
out:
	path_put(&old_path);

	return error;
}
4119
4120SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
4121{
4122	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4123}
4124
/**
 * vfs_rename - rename a filesystem object
 * @old_dir:	parent of source
 * @old_dentry:	source
 * @new_dir:	parent of destination
 * @new_dentry:	destination
 * @delegated_inode: returns an inode needing a delegation break
 * @flags:	rename flags
 *
 * The caller must hold multiple mutexes--see lock_rename().
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
 *	a) we can get into loop creation.
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4 screws up. Current fix: serialization on
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
 *	   story.
 *	c) we have to lock _four_ objects - parents and victim (if it exists),
 *	   and source (if it is not a directory).
 *	   And that - after we got ->i_mutex on parents (until then we don't know
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *	   we'd better make sure that there's no link(2) for them.
 *	d) conversion from fhandle to dentry may come in the wrong moment - when
 *	   we are removing the target. Solution: we will have to grab ->i_mutex
 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *	   ->i_mutex on parents, which works but leads to some truly excessive
 *	   locking].
 */
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	       struct inode *new_dir, struct dentry *new_dentry,
	       struct inode **delegated_inode, unsigned int flags)
{
	int error;
	bool is_dir = d_is_dir(old_dentry);
	const unsigned char *old_name;
	struct inode *source = old_dentry->d_inode;
	struct inode *target = new_dentry->d_inode;
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;

	/*
	 * Check source == target.
	 * On overlayfs need to look at underlying inodes.
	 */
	if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0))
		return 0;

	error = may_delete(old_dir, old_dentry, is_dir);
	if (error)
		return error;

	/*
	 * No target: this is a create in new_dir.  Otherwise the target
	 * is checked with may_delete(); for RENAME_EXCHANGE its own
	 * directory-ness is passed, for a plain rename the source's.
	 */
	if (!target) {
		error = may_create(new_dir, new_dentry);
	} else {
		new_is_dir = d_is_dir(new_dentry);

		if (!(flags & RENAME_EXCHANGE))
			error = may_delete(new_dir, new_dentry, is_dir);
		else
			error = may_delete(new_dir, new_dentry, new_is_dir);
	}
	if (error)
		return error;

	if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
		return -EPERM;

	/* Any rename flag needs the ->rename2() method. */
	if (flags && !old_dir->i_op->rename2)
		return -EINVAL;

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
	if (new_dir != old_dir) {
		if (is_dir) {
			error = inode_permission(source, MAY_WRITE);
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
			error = inode_permission(target, MAY_WRITE);
			if (error)
				return error;
		}
	}

	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
	if (error)
		return error;

	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
	dget(new_dentry);
	/*
	 * Non-directory source or an exchange: lock source and target via
	 * lock_two_nondirectories().  Otherwise only an existing target is
	 * locked here.  The 'out' path mirrors this choice when unlocking.
	 */
	if (!is_dir || (flags & RENAME_EXCHANGE))
		lock_two_nondirectories(source, target);
	else if (target)
		mutex_lock(&target->i_mutex);

	error = -EBUSY;
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
		goto out;

	/*
	 * Link limit: checked only when moving between directories, and only
	 * where a directory arrives in a parent without one leaving it.
	 */
	if (max_links && new_dir != old_dir) {
		error = -EMLINK;
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
			goto out;
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	if (is_dir && !(flags & RENAME_EXCHANGE) && target)
		shrink_dcache_parent(new_dentry);
	/* Delegations are only broken on non-directories. */
	if (!is_dir) {
		error = try_break_deleg(source, delegated_inode);
		if (error)
			goto out;
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
	}
	/* Prefer ->rename2() when present; warn if both methods are set. */
	if (!old_dir->i_op->rename2) {
		error = old_dir->i_op->rename(old_dir, old_dentry,
					      new_dir, new_dentry);
	} else {
		WARN_ON(old_dir->i_op->rename != NULL);
		error = old_dir->i_op->rename2(old_dir, old_dentry,
					       new_dir, new_dentry, flags);
	}
	if (error)
		goto out;

	/* A replaced target is handled like a removed object. */
	if (!(flags & RENAME_EXCHANGE) && target) {
		if (is_dir)
			target->i_flags |= S_DEAD;
		dont_mount(new_dentry);
		detach_mounts(new_dentry);
	}
	/* Update the dcache here unless the filesystem says it did so itself. */
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
out:
	if (!is_dir || (flags & RENAME_EXCHANGE))
		unlock_two_nondirectories(source, target);
	else if (target)
		mutex_unlock(&target->i_mutex);
	dput(new_dentry);
	/* On success notify for the move; an exchange notifies both ways. */
	if (!error) {
		fsnotify_move(old_dir, new_dir, old_name, is_dir,
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
			fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
				      new_is_dir, NULL, new_dentry);
		}
	}
	fsnotify_oldname_free(old_name);

	return error;
}
EXPORT_SYMBOL(vfs_rename);
4311
/*
 * renameat2() - rename or exchange two directory entries.
 *
 * @olddfd/@oldname: directory fd and user-space pathname of the source.
 * @newdfd/@newname: directory fd and user-space pathname of the target.
 * @flags: any of RENAME_NOREPLACE, RENAME_EXCHANGE, RENAME_WHITEOUT.
 *         RENAME_EXCHANGE may not be combined with the other two, and
 *         RENAME_WHITEOUT is refused without CAP_MKNOD.
 *
 * Both parents are resolved with user_path_parent(), the two final
 * components are looked up under lock_rename(), and the actual work is
 * handed to vfs_rename().  Returns vfs_rename()'s result, or a negative
 * errno from one of the checks below.
 *
 * The exit labels unwind in reverse order of acquisition, and most checks
 * are written as "error = ...; if (...) goto exitN;", i.e. the error code
 * is staged before the test that may use it.
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
	struct inode *delegated_inode = NULL;
	struct filename *from;
	struct filename *to;
	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
	bool should_retry = false;
	int error;

	/* reject any flag bit we do not know about */
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	/* RENAME_EXCHANGE excludes both RENAME_NOREPLACE and RENAME_WHITEOUT */
	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
		return -EINVAL;

	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
		return -EPERM;

	/* an exchange does not look the target up with LOOKUP_RENAME_TARGET */
	if (flags & RENAME_EXCHANGE)
		target_flags = 0;

	/* re-entered with LOOKUP_REVAL added when retry_estale() asks for it */
retry:
	from = user_path_parent(olddfd, oldname,
				&old_path, &old_last, &old_type, lookup_flags);
	if (IS_ERR(from)) {
		error = PTR_ERR(from);
		goto exit;
	}

	to = user_path_parent(newdfd, newname,
				&new_path, &new_last, &new_type, lookup_flags);
	if (IS_ERR(to)) {
		error = PTR_ERR(to);
		goto exit1;
	}

	/* both parents must sit on the same vfsmount */
	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto exit2;

	/* the last component of the source must be of type LAST_NORM */
	error = -EBUSY;
	if (old_type != LAST_NORM)
		goto exit2;

	/*
	 * Same for the target; the error stays -EBUSY unless
	 * RENAME_NOREPLACE was given, in which case it is -EEXIST.
	 */
	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
	if (new_type != LAST_NORM)
		goto exit2;

	error = mnt_want_write(old_path.mnt);
	if (error)
		goto exit2;

	/* re-entered after a successful break_deleg_wait() at exit3 */
retry_deleg:
	trap = lock_rename(new_path.dentry, old_path.dentry);

	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
	if (d_is_negative(old_dentry))
		goto exit4;
	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	/* RENAME_NOREPLACE: an existing target is an error */
	error = -EEXIST;
	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
		goto exit5;
	if (flags & RENAME_EXCHANGE) {
		/* an exchange needs an existing target as well */
		error = -ENOENT;
		if (d_is_negative(new_dentry))
			goto exit5;

		/*
		 * A non-directory target must not be followed by anything
		 * after its last component (non-NUL byte at name[len]).
		 */
		if (!d_is_dir(new_dentry)) {
			error = -ENOTDIR;
			if (new_last.name[new_last.len])
				goto exit5;
		}
	}
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!d_is_dir(old_dentry)) {
		error = -ENOTDIR;
		if (old_last.name[old_last.len])
			goto exit5;
		/* for an exchange the target name was already checked above */
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
			goto exit5;
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
		goto exit5;
	/* target should not be an ancestor of source */
	/* (-ENOTEMPTY for a plain rename, still -EINVAL for an exchange) */
	if (!(flags & RENAME_EXCHANGE))
		error = -ENOTEMPTY;
	if (new_dentry == trap)
		goto exit5;

	error = security_path_rename(&old_path, old_dentry,
				     &new_path, new_dentry, flags);
	if (error)
		goto exit5;
	/* vfs_rename() may hand an inode back through delegated_inode */
	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
			   new_path.dentry->d_inode, new_dentry,
			   &delegated_inode, flags);
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
	unlock_rename(new_path.dentry, old_path.dentry);
	/*
	 * If an inode was reported back, wait on it with the rename locks
	 * dropped; when the wait succeeds, redo the lookups and the rename,
	 * otherwise return the wait's error.
	 */
	if (delegated_inode) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(old_path.mnt);
exit2:
	/* only note the retry here; it happens after everything is released */
	if (retry_estale(error, lookup_flags))
		should_retry = true;
	path_put(&new_path);
	putname(to);
exit1:
	path_put(&old_path);
	putname(from);
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
exit:
	return error;
}
4454
4455SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4456		int, newdfd, const char __user *, newname)
4457{
4458	return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
4459}
4460
4461SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
4462{
4463	return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4464}
4465
4466int vfs_whiteout(struct inode *dir, struct dentry *dentry)
4467{
4468	int error = may_create(dir, dentry);
4469	if (error)
4470		return error;
4471
4472	if (!dir->i_op->mknod)
4473		return -EPERM;
4474
4475	return dir->i_op->mknod(dir, dentry,
4476				S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
4477}
4478EXPORT_SYMBOL(vfs_whiteout);
4479
4480int readlink_copy(char __user *buffer, int buflen, const char *link)
4481{
4482	int len = PTR_ERR(link);
4483	if (IS_ERR(link))
4484		goto out;
4485
4486	len = strlen(link);
4487	if (len > (unsigned) buflen)
4488		len = buflen;
4489	if (copy_to_user(buffer, link, len))
4490		len = -EFAULT;
4491out:
4492	return len;
4493}
4494EXPORT_SYMBOL(readlink_copy);
4495
4496/*
4497 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
4498 * have ->follow_link() touching nd only in nd_set_link().  Using (or not
4499 * using) it for any given inode is up to filesystem.
4500 */
4501int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4502{
4503	void *cookie;
4504	struct inode *inode = d_inode(dentry);
4505	const char *link = inode->i_link;
4506	int res;
4507
4508	if (!link) {
4509		link = inode->i_op->follow_link(dentry, &cookie);
4510		if (IS_ERR(link))
4511			return PTR_ERR(link);
4512	}
4513	res = readlink_copy(buffer, buflen, link);
4514	if (inode->i_op->put_link)
4515		inode->i_op->put_link(inode, cookie);
4516	return res;
4517}
4518EXPORT_SYMBOL(generic_readlink);
4519
4520/* get the link contents into pagecache */
4521static char *page_getlink(struct dentry * dentry, struct page **ppage)
4522{
4523	char *kaddr;
4524	struct page *page;
4525	struct address_space *mapping = dentry->d_inode->i_mapping;
4526	page = read_mapping_page(mapping, 0, NULL);
4527	if (IS_ERR(page))
4528		return (char*)page;
4529	*ppage = page;
4530	kaddr = kmap(page);
4531	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
4532	return kaddr;
4533}
4534
4535int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4536{
4537	struct page *page = NULL;
4538	int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
4539	if (page) {
4540		kunmap(page);
4541		page_cache_release(page);
4542	}
4543	return res;
4544}
4545EXPORT_SYMBOL(page_readlink);
4546
4547const char *page_follow_link_light(struct dentry *dentry, void **cookie)
4548{
4549	struct page *page = NULL;
4550	char *res = page_getlink(dentry, &page);
4551	if (!IS_ERR(res))
4552		*cookie = page;
4553	return res;
4554}
4555EXPORT_SYMBOL(page_follow_link_light);
4556
/*
 * page_put_link - ->put_link() for pagecache-backed symlinks.
 *
 * @cookie is treated as the struct page to release: it is kunmap()ed and
 * its pagecache reference is dropped.  The inode argument is not used.
 */
void page_put_link(struct inode *unused, void *cookie)
{
	struct page *pinned = cookie;

	kunmap(pinned);
	page_cache_release(pinned);
}
EXPORT_SYMBOL(page_put_link);
4564
4565/*
4566 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
4567 */
4568int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
4569{
4570	struct address_space *mapping = inode->i_mapping;
4571	struct page *page;
4572	void *fsdata;
4573	int err;
4574	char *kaddr;
4575	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
4576	if (nofs)
4577		flags |= AOP_FLAG_NOFS;
4578
4579retry:
4580	err = pagecache_write_begin(NULL, mapping, 0, len-1,
4581				flags, &page, &fsdata);
4582	if (err)
4583		goto fail;
4584
4585	kaddr = kmap_atomic(page);
4586	memcpy(kaddr, symname, len-1);
4587	kunmap_atomic(kaddr);
4588
4589	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
4590							page, fsdata);
4591	if (err < 0)
4592		goto fail;
4593	if (err < len-1)
4594		goto retry;
4595
4596	mark_inode_dirty(inode);
4597	return 0;
4598fail:
4599	return err;
4600}
4601EXPORT_SYMBOL(__page_symlink);
4602
4603int page_symlink(struct inode *inode, const char *symname, int len)
4604{
4605	return __page_symlink(inode, symname, len,
4606			!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4607}
4608EXPORT_SYMBOL(page_symlink);
4609
4610const struct inode_operations page_symlink_inode_operations = {
4611	.readlink	= generic_readlink,
4612	.follow_link	= page_follow_link_light,
4613	.put_link	= page_put_link,
4614};
4615EXPORT_SYMBOL(page_symlink_inode_operations);
4616