/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/time.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) &
			 -BITS_PER_LONG;

static void *alloc_fdmem(size_t size)
{
	/*
	 * Very large allocations can stress page reclaim, so fall back to
	 * vmalloc() if the allocation size will be considered "large" by the VM.
	 */
	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
		if (data != NULL)
			return data;
	}
	return vmalloc(size);
}
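
/*
 * Illustrative note (editor's addition, not from the original source):
 * with 4 KiB pages and PAGE_ALLOC_COSTLY_ORDER == 3, the kmalloc() path
 * above is tried for requests up to 32 KiB - i.e. fd arrays of up to
 * 4096 slots on 64-bit - and anything larger goes straight to vmalloc().
 * Either way the result must be freed with kvfree(), which dispatches on
 * the address:
 *
 *	void *tbl = alloc_fdmem(nr * sizeof(struct file *));
 *	if (tbl)
 *		kvfree(tbl);	// correct for kmalloc and vmalloc memory alike
 */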

static void __free_fdtable(struct fdtable *fdt)
{
	kvfree(fdt->fd);
	kvfree(fdt->open_fds);
	kfree(fdt);
}

static void free_fdtable_rcu(struct rcu_head *rcu)
{
	__free_fdtable(container_of(rcu, struct fdtable, rcu));
}

/*
 * Copy the existing fdtable into the expanded one.  Called with the files
 * spinlock held.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
	unsigned int cpy, set;

	BUG_ON(nfdt->max_fds < ofdt->max_fds);

	cpy = ofdt->max_fds * sizeof(struct file *);
	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
	memcpy(nfdt->fd, ofdt->fd, cpy);
	memset((char *)(nfdt->fd) + cpy, 0, set);

	cpy = ofdt->max_fds / BITS_PER_BYTE;
	set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
	memset((char *)(nfdt->open_fds) + cpy, 0, set);
	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
	memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
}
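
/*
 * Worked example (editor's illustration): growing from max_fds == 256 to
 * max_fds == 1024 on a 64-bit box copies 256 * 8 = 2048 bytes of fd
 * pointers and zeroes the remaining 6144, while each bitmap has
 * 256 / 8 = 32 bytes copied and 96 bytes cleared.  Both max_fds values
 * are always multiples of BITS_PER_LONG, so the byte arithmetic is exact.
 */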

static struct fdtable * alloc_fdtable(unsigned int nr)
{
	struct fdtable *fdt;
	void *data;

	/*
	 * Figure out how many fds we actually want to support in this fdtable.
	 * Allocation steps are keyed to the size of the fdarray, since it
	 * grows far faster than any of the other dynamic data. We try to fit
	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
	 * and growing in powers of two from there on.
	 */
	nr /= (1024 / sizeof(struct file *));
	nr = roundup_pow_of_two(nr + 1);
	nr *= (1024 / sizeof(struct file *));
	/*
	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
	 * had been set lower between the check in expand_files() and here.  Deal
	 * with that in caller, it's cheaper that way.
	 *
	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
	 * bitmaps handling below becomes unpleasant, to put it mildly...
	 */
	if (unlikely(nr > sysctl_nr_open))
		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
	if (!fdt)
		goto out;
	fdt->max_fds = nr;
	data = alloc_fdmem(nr * sizeof(struct file *));
	if (!data)
		goto out_fdt;
	fdt->fd = data;

	data = alloc_fdmem(max_t(size_t,
				 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
	if (!data)
		goto out_arr;
	fdt->open_fds = data;
	data += nr / BITS_PER_BYTE;
	fdt->close_on_exec = data;

	return fdt;

out_arr:
	kvfree(fdt->fd);
out_fdt:
	kfree(fdt);
out:
	return NULL;
}
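
/*
 * Worked example (editor's illustration; on 64-bit,
 * 1024 / sizeof(struct file *) == 128): a request for nr == 300 becomes
 * 300 / 128 == 2, then roundup_pow_of_two(3) == 4, then 4 * 128 == 512
 * slots - a 4096-byte fdarray, i.e. exactly one page.  The "+ 1" inside
 * roundup_pow_of_two() guarantees the result is strictly greater than
 * the requested descriptor number.
 */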

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 1 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_fdtable(struct files_struct *files, int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *new_fdt, *cur_fdt;

	spin_unlock(&files->file_lock);
	new_fdt = alloc_fdtable(nr);
	spin_lock(&files->file_lock);
	if (!new_fdt)
		return -ENOMEM;
	/*
	 * extremely unlikely race - sysctl_nr_open decreased between the check in
	 * caller and alloc_fdtable().  Cheaper to catch it here...
	 */
	if (unlikely(new_fdt->max_fds <= nr)) {
		__free_fdtable(new_fdt);
		return -EMFILE;
	}
	/*
	 * Check again since another task may have expanded the fd table while
	 * we dropped the lock
	 */
	cur_fdt = files_fdtable(files);
	if (nr >= cur_fdt->max_fds) {
		/* Continue as planned */
		copy_fdtable(new_fdt, cur_fdt);
		rcu_assign_pointer(files->fdt, new_fdt);
		if (cur_fdt != &files->fdtab)
			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
	} else {
		/* Somebody else expanded, so undo our attempt */
		__free_fdtable(new_fdt);
	}
	return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, int nr)
{
	struct fdtable *fdt;

	fdt = files_fdtable(files);

	/* Do we need to expand? */
	if (nr < fdt->max_fds)
		return 0;

	/* Can we expand? */
	if (nr >= sysctl_nr_open)
		return -EMFILE;

	/* All good, so we try */
	return expand_fdtable(files, nr);
}
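
/*
 * Editor's sketch (hypothetical caller fragment, mirroring __alloc_fd()
 * below): because a return of 1 means the lock was dropped and the table
 * may have changed under us, callers must re-read the fdtable and retry.
 */
#if 0
	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);
	/* ... pick a candidate fd ... */
	error = expand_files(files, fd);
	if (error < 0)
		goto out;	/* -EMFILE or -ENOMEM */
	if (error)
		goto repeat;	/* expanded and may have blocked: rescan */
	/* 0: no expansion was needed, fdt is still valid */
#endif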

static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->close_on_exec);
}

static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->close_on_exec);
}

static inline void __set_open_fd(int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->open_fds);
}

static inline void __clear_open_fd(int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->open_fds);
}

static int count_open_files(struct fdtable *fdt)
{
	int size = fdt->max_fds;
	int i;

	/* Find the last open fd */
	for (i = size / BITS_PER_LONG; i > 0; ) {
		if (fdt->open_fds[--i])
			break;
	}
	i = (i + 1) * BITS_PER_LONG;
	return i;
}
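
/*
 * Editor's note: the result is rounded up to a BITS_PER_LONG boundary,
 * not to the highest open fd itself.  E.g. on 64-bit, if the highest
 * open descriptor is 70, count_open_files() returns 128 - enough whole
 * bitmap words to cover every set bit.
 */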

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 * errorp will be valid only when the returned files_struct is NULL.
 */
struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
	struct files_struct *newf;
	struct file **old_fds, **new_fds;
	int open_files, size, i;
	struct fdtable *old_fdt, *new_fdt;

	*errorp = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
	if (!newf)
		goto out;

	atomic_set(&newf->count, 1);

	spin_lock_init(&newf->file_lock);
	newf->next_fd = 0;
	new_fdt = &newf->fdtab;
	new_fdt->max_fds = NR_OPEN_DEFAULT;
	new_fdt->close_on_exec = newf->close_on_exec_init;
	new_fdt->open_fds = newf->open_fds_init;
	new_fdt->fd = &newf->fd_array[0];

	spin_lock(&oldf->file_lock);
	old_fdt = files_fdtable(oldf);
	open_files = count_open_files(old_fdt);

	/*
	 * Check whether we need to allocate a larger fd array and fd set.
	 */
	while (unlikely(open_files > new_fdt->max_fds)) {
		spin_unlock(&oldf->file_lock);

		if (new_fdt != &newf->fdtab)
			__free_fdtable(new_fdt);

		new_fdt = alloc_fdtable(open_files - 1);
		if (!new_fdt) {
			*errorp = -ENOMEM;
			goto out_release;
		}

		/* beyond sysctl_nr_open; nothing to do */
		if (unlikely(new_fdt->max_fds < open_files)) {
			__free_fdtable(new_fdt);
			*errorp = -EMFILE;
			goto out_release;
		}

		/*
		 * Reacquire the oldf lock and a fresh pointer to its fd
		 * table; it may have grown to a new, bigger table while we
		 * dropped the lock, and we need the latest pointer.
		 */
		spin_lock(&oldf->file_lock);
		old_fdt = files_fdtable(oldf);
		open_files = count_open_files(old_fdt);
	}

	old_fds = old_fdt->fd;
	new_fds = new_fdt->fd;

	memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
	memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f) {
			get_file(f);
		} else {
			/*
			 * The fd may be claimed in the fd bitmap but not yet
			 * instantiated in the files array if a sibling thread
			 * is partway through open().  So make sure that this
			 * fd is available to the new process.
			 */
			__clear_open_fd(open_files - i, new_fdt);
		}
		rcu_assign_pointer(*new_fds++, f);
	}
	spin_unlock(&oldf->file_lock);

	/* compute the remainder to be cleared */
	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);

	/* This is long-word aligned, thus could use an optimized version */
	memset(new_fds, 0, size);

	if (new_fdt->max_fds > open_files) {
		int left = (new_fdt->max_fds - open_files) / 8;
		int start = open_files / BITS_PER_LONG;

		memset(&new_fdt->open_fds[start], 0, left);
		memset(&new_fdt->close_on_exec[start], 0, left);
	}

	rcu_assign_pointer(newf->fdt, new_fdt);

	return newf;

out_release:
	kmem_cache_free(files_cachep, newf);
out:
	return NULL;
}
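
/*
 * Editor's sketch (hypothetical caller, modelled on copy_files() in
 * kernel/fork.c): fork() duplicates the parent's descriptor table with
 * dup_fd() unless CLONE_FILES asked for sharing.
 */
#if 0
	struct files_struct *newf;
	int error;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);	/* share, don't copy */
	} else {
		newf = dup_fd(oldf, &error);
		if (!newf)
			return error;		/* errorp was filled in */
	}
#endif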

static struct fdtable *close_files(struct files_struct * files)
{
	/*
	 * It is safe to dereference the fd table without RCU or
	 * ->file_lock because this is the last reference to the
	 * files structure.
	 */
	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
	int i, j = 0;

	for (;;) {
		unsigned long set;
		i = j * BITS_PER_LONG;
		if (i >= fdt->max_fds)
			break;
		set = fdt->open_fds[j++];
		while (set) {
			if (set & 1) {
				struct file * file = xchg(&fdt->fd[i], NULL);
				if (file) {
					filp_close(file, files);
					cond_resched_rcu_qs();
				}
			}
			i++;
			set >>= 1;
		}
	}

	return fdt;
}

struct files_struct *get_files_struct(struct task_struct *task)
{
	struct files_struct *files;

	task_lock(task);
	files = task->files;
	if (files)
		atomic_inc(&files->count);
	task_unlock(task);

	return files;
}

void put_files_struct(struct files_struct *files)
{
	if (atomic_dec_and_test(&files->count)) {
		struct fdtable *fdt = close_files(files);

		/* free the arrays if they are not embedded */
		if (fdt != &files->fdtab)
			__free_fdtable(fdt);
		kmem_cache_free(files_cachep, files);
	}
}

void reset_files_struct(struct files_struct *files)
{
	struct task_struct *tsk = current;
	struct files_struct *old;

	old = tsk->files;
	task_lock(tsk);
	tsk->files = files;
	task_unlock(tsk);
	put_files_struct(old);
}

void exit_files(struct task_struct *tsk)
{
	struct files_struct * files = tsk->files;

	if (files) {
		task_lock(tsk);
		tsk->files = NULL;
		task_unlock(tsk);
		put_files_struct(files);
	}
}

struct files_struct init_files = {
	.count		= ATOMIC_INIT(1),
	.fdt		= &init_files.fdtab,
	.fdtab		= {
		.max_fds	= NR_OPEN_DEFAULT,
		.fd		= &init_files.fd_array[0],
		.close_on_exec	= init_files.close_on_exec_init,
		.open_fds	= init_files.open_fds_init,
	},
	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};

/*
 * allocate a file descriptor, mark it busy.
 */
int __alloc_fd(struct files_struct *files,
	       unsigned start, unsigned end, unsigned flags)
{
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	if (fd < fdt->max_fds)
		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	error = -EMFILE;
	if (fd >= end)
		goto out;

	error = expand_files(files, fd);
	if (error < 0)
		goto out;

	/*
	 * If we needed to expand the fd array we
	 * might have blocked - try again.
	 */
	if (error)
		goto repeat;

	if (start <= files->next_fd)
		files->next_fd = fd + 1;

	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	error = fd;
#if 1
	/* Sanity check */
	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
		rcu_assign_pointer(fdt->fd[fd], NULL);
	}
#endif

out:
	spin_unlock(&files->file_lock);
	return error;
}

static int alloc_fd(unsigned start, unsigned flags)
{
	return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
}

int get_unused_fd_flags(unsigned flags)
{
	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
}
EXPORT_SYMBOL(get_unused_fd_flags);

static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
	struct fdtable *fdt = files_fdtable(files);
	__clear_open_fd(fd, fdt);
	if (fd < files->next_fd)
		files->next_fd = fd;
}

void put_unused_fd(unsigned int fd)
{
	struct files_struct *files = current->files;
	spin_lock(&files->file_lock);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() to do it, _really_ bad things
 * will follow.
 *
 * NOTE: the __fd_install() variant is really, really low-level; don't
 * use it unless you are forced to by truly lousy API shoved down
 * your throat.  'files' *MUST* be either current->files or obtained
 * by get_files_struct(current) done by whoever had given it to you,
 * or really bad things will happen.  Normally you want to use
 * fd_install() instead.
 */

void __fd_install(struct files_struct *files, unsigned int fd,
		struct file *file)
{
	struct fdtable *fdt;
	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	BUG_ON(fdt->fd[fd] != NULL);
	rcu_assign_pointer(fdt->fd[fd], file);
	spin_unlock(&files->file_lock);
}

void fd_install(unsigned int fd, struct file *file)
{
	__fd_install(current->files, fd, file);
}

EXPORT_SYMBOL(fd_install);
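
/*
 * Editor's sketch (hypothetical user of this API): the canonical pattern
 * reserves a descriptor first, then publishes the file only once nothing
 * else can fail, since fd_install() cannot be undone.
 */
#if 0
	struct file *file;
	int fd = get_unused_fd_flags(O_CLOEXEC);

	if (fd < 0)
		return fd;
	file = my_file_constructor();		/* hypothetical helper */
	if (IS_ERR(file)) {
		put_unused_fd(fd);		/* release the reserved slot */
		return PTR_ERR(file);
	}
	fd_install(fd, file);			/* publish: no undo after this */
	return fd;
#endif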

/*
 * The same warnings as for __alloc_fd()/__fd_install() apply here...
 */
int __close_fd(struct files_struct *files, unsigned fd)
{
	struct file *file;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (fd >= fdt->max_fds)
		goto out_unlock;
	file = fdt->fd[fd];
	if (!file)
		goto out_unlock;
	rcu_assign_pointer(fdt->fd[fd], NULL);
	__clear_close_on_exec(fd, fdt);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
	return filp_close(file, files);

out_unlock:
	spin_unlock(&files->file_lock);
	return -EBADF;
}
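
/*
 * Editor's note: this is the workhorse behind close(2) - see
 * SYSCALL_DEFINE1(close) in fs/open.c, which is essentially
 * __close_fd(current->files, fd) plus a fixup mapping the restart
 * errnos to -EINTR, since close() cannot be restarted once the table
 * entry has been cleared.
 */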

void do_close_on_exec(struct files_struct *files)
{
	unsigned i;
	struct fdtable *fdt;

	/* exec unshares first */
	spin_lock(&files->file_lock);
	for (i = 0; ; i++) {
		unsigned long set;
		unsigned fd = i * BITS_PER_LONG;
		fdt = files_fdtable(files);
		if (fd >= fdt->max_fds)
			break;
		set = fdt->close_on_exec[i];
		if (!set)
			continue;
		fdt->close_on_exec[i] = 0;
		for ( ; set ; fd++, set >>= 1) {
			struct file *file;
			if (!(set & 1))
				continue;
			file = fdt->fd[fd];
			if (!file)
				continue;
			rcu_assign_pointer(fdt->fd[fd], NULL);
			__put_unused_fd(files, fd);
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		}
	}
	spin_unlock(&files->file_lock);
}

static struct file *__fget(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;
	struct file *file;

	rcu_read_lock();
	file = fcheck_files(files, fd);
	if (file) {
		/* File object ref couldn't be taken */
		if ((file->f_mode & mask) || !get_file_rcu(file))
			file = NULL;
	}
	rcu_read_unlock();

	return file;
}

struct file *fget(unsigned int fd)
{
	return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);

struct file *fget_raw(unsigned int fd)
{
	return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);
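
/*
 * Editor's sketch (hypothetical syscall body): every successful fget()
 * must be paired with fput().  The FMODE_PATH masking above makes plain
 * fget() refuse O_PATH files, while fget_raw() accepts them.
 */
#if 0
	struct file *file = fget(fd);

	if (!file)
		return -EBADF;
	/* ... operate on file ... */
	fput(file);
#endif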

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;
	struct file *file;

	if (atomic_read(&files->count) == 1) {
		file = __fcheck_files(files, fd);
		if (!file || unlikely(file->f_mode & mask))
			return 0;
		return (unsigned long)file;
	} else {
		file = __fget(fd, mask);
		if (!file)
			return 0;
		return FDPUT_FPUT | (unsigned long)file;
	}
}

unsigned long __fdget(unsigned int fd)
{
	return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(__fdget);

unsigned long __fdget_raw(unsigned int fd)
{
	return __fget_light(fd, 0);
}
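
/*
 * Editor's sketch: most callers don't decode these low bits by hand but
 * go through the fdget()/fdput() wrappers in <linux/file.h>, which unpack
 * the FDPUT_FPUT bit into struct fd for them.
 */
#if 0
	struct fd f = fdget(fd);

	if (!f.file)
		return -EBADF;
	/* ... use f.file ... */
	fdput(f);	/* calls fput() only if a reference was actually taken */
#endif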

unsigned long __fdget_pos(unsigned int fd)
{
	unsigned long v = __fdget(fd);
	struct file *file = (struct file *)(v & ~3);

	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
		if (file_count(file) > 1) {
			v |= FDPUT_POS_UNLOCK;
			mutex_lock(&file->f_pos_lock);
		}
	}
	return v;
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

void set_close_on_exec(unsigned int fd, int flag)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (flag)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);
}

bool get_close_on_exec(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	bool res;
	rcu_read_lock();
	fdt = files_fdtable(files);
	res = close_on_exec(fd, fdt);
	rcu_read_unlock();
	return res;
}

static int do_dup2(struct files_struct *files,
	struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
	struct file *tofree;
	struct fdtable *fdt;

	/*
	 * We need to detect attempts to do dup2() over an allocated but still
	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
	 * extra work in their equivalent of fget() - they insert struct
	 * file immediately after grabbing descriptor, mark it larval if
	 * more work (e.g. actual opening) is needed and make sure that
	 * fget() treats larval files as absent.  Potentially interesting,
	 * but while extra work in fget() is trivial, locking implications
	 * and amount of surgery on open()-related paths in VFS are not.
	 * FreeBSD fails with -EBADF in the same situation, NetBSD's "solution"
	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
	 * scope of POSIX or SUS, since neither considers shared descriptor
	 * tables and this condition does not arise without those.
	 */
	fdt = files_fdtable(files);
	tofree = fdt->fd[fd];
	if (!tofree && fd_is_open(fd, fdt))
		goto Ebusy;
	get_file(file);
	rcu_assign_pointer(fdt->fd[fd], file);
	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);

	if (tofree)
		filp_close(tofree, files);

	return fd;

Ebusy:
	spin_unlock(&files->file_lock);
	return -EBUSY;
}

int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
	int err;
	struct files_struct *files = current->files;

	if (!file)
		return __close_fd(files, fd);

	if (fd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, fd);
	if (unlikely(err < 0))
		goto out_unlock;
	return do_dup2(files, file, fd, flags);

out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}
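
/*
 * Editor's sketch (hypothetical use): atomically repoint a well-known
 * descriptor, e.g. redirecting a kernel-started helper's stdin.
 * do_dup2() takes its own reference via get_file(), so the caller still
 * owns - and must drop - the one from filp_open().
 */
#if 0
	struct file *null = filp_open("/dev/null", O_RDWR, 0);

	if (!IS_ERR(null)) {
		replace_fd(0, null, 0);	/* the old fd 0, if any, is closed */
		fput(null);		/* drop the filp_open() reference */
	}
#endif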

SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	int err = -EBADF;
	struct file *file;
	struct files_struct *files = current->files;

	if ((flags & ~O_CLOEXEC) != 0)
		return -EINVAL;

	if (unlikely(oldfd == newfd))
		return -EINVAL;

	if (newfd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, newfd);
	file = fcheck(oldfd);
	if (unlikely(!file))
		goto Ebadf;
	if (unlikely(err < 0)) {
		if (err == -EMFILE)
			goto Ebadf;
		goto out_unlock;
	}
	return do_dup2(files, file, newfd, flags);

Ebadf:
	err = -EBADF;
out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
	if (unlikely(newfd == oldfd)) { /* corner case */
		struct files_struct *files = current->files;
		int retval = oldfd;

		rcu_read_lock();
		if (!fcheck_files(files, oldfd))
			retval = -EBADF;
		rcu_read_unlock();
		return retval;
	}
	return sys_dup3(oldfd, newfd, 0);
}

SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
	int ret = -EBADF;
	struct file *file = fget_raw(fildes);

	if (file) {
		ret = get_unused_fd_flags(0);
		if (ret >= 0)
			fd_install(ret, file);
		else
			fput(file);
	}
	return ret;
}

int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
	int err;
	if (from >= rlimit(RLIMIT_NOFILE))
		return -EINVAL;
	err = alloc_fd(from, flags);
	if (err >= 0) {
		get_file(file);
		fd_install(err, file);
	}
	return err;
}
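
/*
 * Editor's note: this implements the F_DUPFD and F_DUPFD_CLOEXEC fcntl(2)
 * commands (see fs/fcntl.c) - "give me a duplicate at or above 'from'" -
 * which is why a too-large 'from' is -EINVAL here rather than the -EBADF
 * that dup3() returns for an out-of-range target descriptor.
 */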

int iterate_fd(struct files_struct *files, unsigned n,
		int (*f)(const void *, struct file *, unsigned),
		const void *p)
{
	struct fdtable *fdt;
	int res = 0;
	if (!files)
		return 0;
	spin_lock(&files->file_lock);
	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
		struct file *file;
		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
		if (!file)
			continue;
		res = f(p, file, n);
		if (res)
			break;
	}
	spin_unlock(&files->file_lock);
	return res;
}
EXPORT_SYMBOL(iterate_fd);
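
/*
 * Editor's sketch (hypothetical callback): count a task's open files.
 * A non-zero return from the callback stops the walk and becomes
 * iterate_fd()'s return value.
 */
#if 0
static int count_one(const void *p, struct file *file, unsigned fd)
{
	++*(int *)p;		/* const is cast away for this toy example */
	return 0;		/* 0 means keep walking */
}

/* ... then, in the caller: */
	int count = 0;

	iterate_fd(current->files, 0, count_one, &count);
#endif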