1/*
2 * "splice": joining two ropes together by interweaving their strands.
3 *
4 * This is the "extended pipe" functionality, where a pipe is used as
5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6 * buffer that you can use to transfer data from one end to the other.
7 *
8 * The traditional unix read/write is extended with a "splice()" operation
9 * that transfers data buffers to or from a pipe buffer.
10 *
11 * Named by Larry McVoy, original implementation from Linus, extended by
12 * Jens to support splicing to files, network, direct splicing, etc and
13 * fixing lots of bugs.
14 *
15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18 *
19 */
20#include <linux/fs.h>
21#include <linux/file.h>
22#include <linux/pagemap.h>
23#include <linux/splice.h>
24#include <linux/memcontrol.h>
25#include <linux/mm_inline.h>
26#include <linux/swap.h>
27#include <linux/writeback.h>
28#include <linux/export.h>
29#include <linux/syscalls.h>
30#include <linux/uio.h>
31#include <linux/security.h>
32#include <linux/gfp.h>
33#include <linux/socket.h>
34#include <linux/compat.h>
35#include "internal.h"
36
37/*
38 * Attempt to steal a page from a pipe buffer. This should perhaps go into
39 * a vm helper function, it's already simplified quite a bit by the
40 * addition of remove_mapping(). If success is returned, the caller may
41 * attempt to reuse this page for another destination.
42 */
43static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
44				     struct pipe_buffer *buf)
45{
46	struct page *page = buf->page;
47	struct address_space *mapping;
48
49	lock_page(page);
50
51	mapping = page_mapping(page);
52	if (mapping) {
53		WARN_ON(!PageUptodate(page));
54
55		/*
56		 * At least for ext2 with nobh option, we need to wait on
57		 * writeback completing on this page, since we'll remove it
58		 * from the pagecache.  Otherwise truncate wont wait on the
59		 * page, allowing the disk blocks to be reused by someone else
60		 * before we actually wrote our data to them. fs corruption
61		 * ensues.
62		 */
63		wait_on_page_writeback(page);
64
65		if (page_has_private(page) &&
66		    !try_to_release_page(page, GFP_KERNEL))
67			goto out_unlock;
68
69		/*
70		 * If we succeeded in removing the mapping, set LRU flag
71		 * and return good.
72		 */
73		if (remove_mapping(mapping, page)) {
74			buf->flags |= PIPE_BUF_FLAG_LRU;
75			return 0;
76		}
77	}
78
79	/*
80	 * Raced with truncate or failed to remove page from current
81	 * address space, unlock and return failure.
82	 */
83out_unlock:
84	unlock_page(page);
85	return 1;
86}
87
88static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
89					struct pipe_buffer *buf)
90{
91	page_cache_release(buf->page);
92	buf->flags &= ~PIPE_BUF_FLAG_LRU;
93}
94
95/*
96 * Check whether the contents of buf is OK to access. Since the content
97 * is a page cache page, IO may be in flight.
98 */
99static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
100				       struct pipe_buffer *buf)
101{
102	struct page *page = buf->page;
103	int err;
104
105	if (!PageUptodate(page)) {
106		lock_page(page);
107
108		/*
109		 * Page got truncated/unhashed. This will cause a 0-byte
110		 * splice, if this is the first page.
111		 */
112		if (!page->mapping) {
113			err = -ENODATA;
114			goto error;
115		}
116
117		/*
118		 * Uh oh, read-error from disk.
119		 */
120		if (!PageUptodate(page)) {
121			err = -EIO;
122			goto error;
123		}
124
125		/*
126		 * Page is ok afterall, we are done.
127		 */
128		unlock_page(page);
129	}
130
131	return 0;
132error:
133	unlock_page(page);
134	return err;
135}
136
137const struct pipe_buf_operations page_cache_pipe_buf_ops = {
138	.can_merge = 0,
139	.confirm = page_cache_pipe_buf_confirm,
140	.release = page_cache_pipe_buf_release,
141	.steal = page_cache_pipe_buf_steal,
142	.get = generic_pipe_buf_get,
143};
144
145static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
146				    struct pipe_buffer *buf)
147{
148	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
149		return 1;
150
151	buf->flags |= PIPE_BUF_FLAG_LRU;
152	return generic_pipe_buf_steal(pipe, buf);
153}
154
155static const struct pipe_buf_operations user_page_pipe_buf_ops = {
156	.can_merge = 0,
157	.confirm = generic_pipe_buf_confirm,
158	.release = page_cache_pipe_buf_release,
159	.steal = user_page_pipe_buf_steal,
160	.get = generic_pipe_buf_get,
161};
162
/*
 * Notify anyone waiting for data on @pipe: wake sleepers on pipe->wait
 * (only if the wait queue is non-empty) and deliver SIGIO/POLL_IN to the
 * pipe's fasync readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	/*
	 * Full barrier before the lockless waitqueue_active() check.
	 * NOTE(review): presumably orders the preceding pipe updates against
	 * the check so a concurrent sleeper is not missed - confirm.
	 */
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
170
171/**
172 * splice_to_pipe - fill passed data into a pipe
173 * @pipe:	pipe to fill
174 * @spd:	data to fill
175 *
176 * Description:
177 *    @spd contains a map of pages and len/offset tuples, along with
178 *    the struct pipe_buf_operations associated with these pages. This
179 *    function will link that data to the pipe.
180 *
181 */
182ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
183		       struct splice_pipe_desc *spd)
184{
185	unsigned int spd_pages = spd->nr_pages;
186	int ret, do_wakeup, page_nr;
187
188	if (!spd_pages)
189		return 0;
190
191	ret = 0;
192	do_wakeup = 0;
193	page_nr = 0;
194
195	pipe_lock(pipe);
196
197	for (;;) {
198		if (!pipe->readers) {
199			send_sig(SIGPIPE, current, 0);
200			if (!ret)
201				ret = -EPIPE;
202			break;
203		}
204
205		if (pipe->nrbufs < pipe->buffers) {
206			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
207			struct pipe_buffer *buf = pipe->bufs + newbuf;
208
209			buf->page = spd->pages[page_nr];
210			buf->offset = spd->partial[page_nr].offset;
211			buf->len = spd->partial[page_nr].len;
212			buf->private = spd->partial[page_nr].private;
213			buf->ops = spd->ops;
214			if (spd->flags & SPLICE_F_GIFT)
215				buf->flags |= PIPE_BUF_FLAG_GIFT;
216
217			pipe->nrbufs++;
218			page_nr++;
219			ret += buf->len;
220
221			if (pipe->files)
222				do_wakeup = 1;
223
224			if (!--spd->nr_pages)
225				break;
226			if (pipe->nrbufs < pipe->buffers)
227				continue;
228
229			break;
230		}
231
232		if (spd->flags & SPLICE_F_NONBLOCK) {
233			if (!ret)
234				ret = -EAGAIN;
235			break;
236		}
237
238		if (signal_pending(current)) {
239			if (!ret)
240				ret = -ERESTARTSYS;
241			break;
242		}
243
244		if (do_wakeup) {
245			smp_mb();
246			if (waitqueue_active(&pipe->wait))
247				wake_up_interruptible_sync(&pipe->wait);
248			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
249			do_wakeup = 0;
250		}
251
252		pipe->waiting_writers++;
253		pipe_wait(pipe);
254		pipe->waiting_writers--;
255	}
256
257	pipe_unlock(pipe);
258
259	if (do_wakeup)
260		wakeup_pipe_readers(pipe);
261
262	while (page_nr < spd_pages)
263		spd->spd_release(spd, page_nr++);
264
265	return ret;
266}
267
268void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
269{
270	page_cache_release(spd->pages[i]);
271}
272
273/*
274 * Check if we need to grow the arrays holding pages and partial page
275 * descriptions.
276 */
277int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
278{
279	unsigned int buffers = ACCESS_ONCE(pipe->buffers);
280
281	spd->nr_pages_max = buffers;
282	if (buffers <= PIPE_DEF_BUFFERS)
283		return 0;
284
285	spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
286	spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
287
288	if (spd->pages && spd->partial)
289		return 0;
290
291	kfree(spd->pages);
292	kfree(spd->partial);
293	return -ENOMEM;
294}
295
296void splice_shrink_spd(struct splice_pipe_desc *spd)
297{
298	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
299		return;
300
301	kfree(spd->pages);
302	kfree(spd->partial);
303}
304
305static int
306__generic_file_splice_read(struct file *in, loff_t *ppos,
307			   struct pipe_inode_info *pipe, size_t len,
308			   unsigned int flags)
309{
310	struct address_space *mapping = in->f_mapping;
311	unsigned int loff, nr_pages, req_pages;
312	struct page *pages[PIPE_DEF_BUFFERS];
313	struct partial_page partial[PIPE_DEF_BUFFERS];
314	struct page *page;
315	pgoff_t index, end_index;
316	loff_t isize;
317	int error, page_nr;
318	struct splice_pipe_desc spd = {
319		.pages = pages,
320		.partial = partial,
321		.nr_pages_max = PIPE_DEF_BUFFERS,
322		.flags = flags,
323		.ops = &page_cache_pipe_buf_ops,
324		.spd_release = spd_release_page,
325	};
326
327	if (splice_grow_spd(pipe, &spd))
328		return -ENOMEM;
329
330	index = *ppos >> PAGE_CACHE_SHIFT;
331	loff = *ppos & ~PAGE_CACHE_MASK;
332	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
333	nr_pages = min(req_pages, spd.nr_pages_max);
334
335	/*
336	 * Lookup the (hopefully) full range of pages we need.
337	 */
338	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
339	index += spd.nr_pages;
340
341	/*
342	 * If find_get_pages_contig() returned fewer pages than we needed,
343	 * readahead/allocate the rest and fill in the holes.
344	 */
345	if (spd.nr_pages < nr_pages)
346		page_cache_sync_readahead(mapping, &in->f_ra, in,
347				index, req_pages - spd.nr_pages);
348
349	error = 0;
350	while (spd.nr_pages < nr_pages) {
351		/*
352		 * Page could be there, find_get_pages_contig() breaks on
353		 * the first hole.
354		 */
355		page = find_get_page(mapping, index);
356		if (!page) {
357			/*
358			 * page didn't exist, allocate one.
359			 */
360			page = page_cache_alloc_cold(mapping);
361			if (!page)
362				break;
363
364			error = add_to_page_cache_lru(page, mapping, index,
365						GFP_KERNEL);
366			if (unlikely(error)) {
367				page_cache_release(page);
368				if (error == -EEXIST)
369					continue;
370				break;
371			}
372			/*
373			 * add_to_page_cache() locks the page, unlock it
374			 * to avoid convoluting the logic below even more.
375			 */
376			unlock_page(page);
377		}
378
379		spd.pages[spd.nr_pages++] = page;
380		index++;
381	}
382
383	/*
384	 * Now loop over the map and see if we need to start IO on any
385	 * pages, fill in the partial map, etc.
386	 */
387	index = *ppos >> PAGE_CACHE_SHIFT;
388	nr_pages = spd.nr_pages;
389	spd.nr_pages = 0;
390	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
391		unsigned int this_len;
392
393		if (!len)
394			break;
395
396		/*
397		 * this_len is the max we'll use from this page
398		 */
399		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
400		page = spd.pages[page_nr];
401
402		if (PageReadahead(page))
403			page_cache_async_readahead(mapping, &in->f_ra, in,
404					page, index, req_pages - page_nr);
405
406		/*
407		 * If the page isn't uptodate, we may need to start io on it
408		 */
409		if (!PageUptodate(page)) {
410			lock_page(page);
411
412			/*
413			 * Page was truncated, or invalidated by the
414			 * filesystem.  Redo the find/create, but this time the
415			 * page is kept locked, so there's no chance of another
416			 * race with truncate/invalidate.
417			 */
418			if (!page->mapping) {
419				unlock_page(page);
420				page = find_or_create_page(mapping, index,
421						mapping_gfp_mask(mapping));
422
423				if (!page) {
424					error = -ENOMEM;
425					break;
426				}
427				page_cache_release(spd.pages[page_nr]);
428				spd.pages[page_nr] = page;
429			}
430			/*
431			 * page was already under io and is now done, great
432			 */
433			if (PageUptodate(page)) {
434				unlock_page(page);
435				goto fill_it;
436			}
437
438			/*
439			 * need to read in the page
440			 */
441			error = mapping->a_ops->readpage(in, page);
442			if (unlikely(error)) {
443				/*
444				 * We really should re-lookup the page here,
445				 * but it complicates things a lot. Instead
446				 * lets just do what we already stored, and
447				 * we'll get it the next time we are called.
448				 */
449				if (error == AOP_TRUNCATED_PAGE)
450					error = 0;
451
452				break;
453			}
454		}
455fill_it:
456		/*
457		 * i_size must be checked after PageUptodate.
458		 */
459		isize = i_size_read(mapping->host);
460		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
461		if (unlikely(!isize || index > end_index))
462			break;
463
464		/*
465		 * if this is the last page, see if we need to shrink
466		 * the length and stop
467		 */
468		if (end_index == index) {
469			unsigned int plen;
470
471			/*
472			 * max good bytes in this page
473			 */
474			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
475			if (plen <= loff)
476				break;
477
478			/*
479			 * force quit after adding this page
480			 */
481			this_len = min(this_len, plen - loff);
482			len = this_len;
483		}
484
485		spd.partial[page_nr].offset = loff;
486		spd.partial[page_nr].len = this_len;
487		len -= this_len;
488		loff = 0;
489		spd.nr_pages++;
490		index++;
491	}
492
493	/*
494	 * Release any pages at the end, if we quit early. 'page_nr' is how far
495	 * we got, 'nr_pages' is how many pages are in the map.
496	 */
497	while (page_nr < nr_pages)
498		page_cache_release(spd.pages[page_nr++]);
499	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
500
501	if (spd.nr_pages)
502		error = splice_to_pipe(pipe, &spd);
503
504	splice_shrink_spd(&spd);
505	return error;
506}
507
508/**
509 * generic_file_splice_read - splice data from file to a pipe
510 * @in:		file to splice from
511 * @ppos:	position in @in
512 * @pipe:	pipe to splice to
513 * @len:	number of bytes to splice
514 * @flags:	splice modifier flags
515 *
516 * Description:
517 *    Will read pages from given file and fill them into a pipe. Can be
518 *    used as long as the address_space operations for the source implements
519 *    a readpage() hook.
520 *
521 */
522ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
523				 struct pipe_inode_info *pipe, size_t len,
524				 unsigned int flags)
525{
526	loff_t isize, left;
527	int ret;
528
529	if (IS_DAX(in->f_mapping->host))
530		return default_file_splice_read(in, ppos, pipe, len, flags);
531
532	isize = i_size_read(in->f_mapping->host);
533	if (unlikely(*ppos >= isize))
534		return 0;
535
536	left = isize - *ppos;
537	if (unlikely(left < len))
538		len = left;
539
540	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
541	if (ret > 0) {
542		*ppos += ret;
543		file_accessed(in);
544	}
545
546	return ret;
547}
548EXPORT_SYMBOL(generic_file_splice_read);
549
550static const struct pipe_buf_operations default_pipe_buf_ops = {
551	.can_merge = 0,
552	.confirm = generic_pipe_buf_confirm,
553	.release = generic_pipe_buf_release,
554	.steal = generic_pipe_buf_steal,
555	.get = generic_pipe_buf_get,
556};
557
/*
 * ->steal() implementation that always refuses: returns 1 (failure)
 * without touching @pipe or @buf.
 */
static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return 1;
}
563
564/* Pipe buffer operations for a socket and similar. */
565const struct pipe_buf_operations nosteal_pipe_buf_ops = {
566	.can_merge = 0,
567	.confirm = generic_pipe_buf_confirm,
568	.release = generic_pipe_buf_release,
569	.steal = generic_pipe_buf_nosteal,
570	.get = generic_pipe_buf_get,
571};
572EXPORT_SYMBOL(nosteal_pipe_buf_ops);
573
574static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
575			    unsigned long vlen, loff_t offset)
576{
577	mm_segment_t old_fs;
578	loff_t pos = offset;
579	ssize_t res;
580
581	old_fs = get_fs();
582	set_fs(get_ds());
583	/* The cast to a user pointer is valid due to the set_fs() */
584	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
585	set_fs(old_fs);
586
587	return res;
588}
589
590ssize_t kernel_write(struct file *file, const char *buf, size_t count,
591			    loff_t pos)
592{
593	mm_segment_t old_fs;
594	ssize_t res;
595
596	old_fs = get_fs();
597	set_fs(get_ds());
598	/* The cast to a user pointer is valid due to the set_fs() */
599	res = vfs_write(file, (__force const char __user *)buf, count, &pos);
600	set_fs(old_fs);
601
602	return res;
603}
604EXPORT_SYMBOL(kernel_write);
605
606ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
607				 struct pipe_inode_info *pipe, size_t len,
608				 unsigned int flags)
609{
610	unsigned int nr_pages;
611	unsigned int nr_freed;
612	size_t offset;
613	struct page *pages[PIPE_DEF_BUFFERS];
614	struct partial_page partial[PIPE_DEF_BUFFERS];
615	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
616	ssize_t res;
617	size_t this_len;
618	int error;
619	int i;
620	struct splice_pipe_desc spd = {
621		.pages = pages,
622		.partial = partial,
623		.nr_pages_max = PIPE_DEF_BUFFERS,
624		.flags = flags,
625		.ops = &default_pipe_buf_ops,
626		.spd_release = spd_release_page,
627	};
628
629	if (splice_grow_spd(pipe, &spd))
630		return -ENOMEM;
631
632	res = -ENOMEM;
633	vec = __vec;
634	if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
635		vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
636		if (!vec)
637			goto shrink_ret;
638	}
639
640	offset = *ppos & ~PAGE_CACHE_MASK;
641	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
642
643	for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
644		struct page *page;
645
646		page = alloc_page(GFP_USER);
647		error = -ENOMEM;
648		if (!page)
649			goto err;
650
651		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
652		vec[i].iov_base = (void __user *) page_address(page);
653		vec[i].iov_len = this_len;
654		spd.pages[i] = page;
655		spd.nr_pages++;
656		len -= this_len;
657		offset = 0;
658	}
659
660	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
661	if (res < 0) {
662		error = res;
663		goto err;
664	}
665
666	error = 0;
667	if (!res)
668		goto err;
669
670	nr_freed = 0;
671	for (i = 0; i < spd.nr_pages; i++) {
672		this_len = min_t(size_t, vec[i].iov_len, res);
673		spd.partial[i].offset = 0;
674		spd.partial[i].len = this_len;
675		if (!this_len) {
676			__free_page(spd.pages[i]);
677			spd.pages[i] = NULL;
678			nr_freed++;
679		}
680		res -= this_len;
681	}
682	spd.nr_pages -= nr_freed;
683
684	res = splice_to_pipe(pipe, &spd);
685	if (res > 0)
686		*ppos += res;
687
688shrink_ret:
689	if (vec != __vec)
690		kfree(vec);
691	splice_shrink_spd(&spd);
692	return res;
693
694err:
695	for (i = 0; i < spd.nr_pages; i++)
696		__free_page(spd.pages[i]);
697
698	res = error;
699	goto shrink_ret;
700}
701EXPORT_SYMBOL(default_file_splice_read);
702
703/*
704 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
705 * using sendpage(). Return the number of bytes sent.
706 */
707static int pipe_to_sendpage(struct pipe_inode_info *pipe,
708			    struct pipe_buffer *buf, struct splice_desc *sd)
709{
710	struct file *file = sd->u.file;
711	loff_t pos = sd->pos;
712	int more;
713
714	if (!likely(file->f_op->sendpage))
715		return -EINVAL;
716
717	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
718
719	if (sd->len < sd->total_len && pipe->nrbufs > 1)
720		more |= MSG_SENDPAGE_NOTLAST;
721
722	return file->f_op->sendpage(file, buf->page, buf->offset,
723				    sd->len, &pos, more);
724}
725
/*
 * Notify anyone waiting for room in @pipe: wake sleepers on pipe->wait
 * (only if the wait queue is non-empty) and deliver SIGIO/POLL_OUT to the
 * pipe's fasync writers.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	/*
	 * Full barrier before the lockless waitqueue_active() check.
	 * NOTE(review): presumably orders the preceding pipe updates against
	 * the check so a concurrent sleeper is not missed - confirm.
	 */
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
733
734/**
735 * splice_from_pipe_feed - feed available data from a pipe to a file
736 * @pipe:	pipe to splice from
737 * @sd:		information to @actor
738 * @actor:	handler that splices the data
739 *
740 * Description:
741 *    This function loops over the pipe and calls @actor to do the
742 *    actual moving of a single struct pipe_buffer to the desired
743 *    destination.  It returns when there's no more buffers left in
744 *    the pipe or if the requested number of bytes (@sd->total_len)
745 *    have been copied.  It returns a positive number (one) if the
746 *    pipe needs to be filled with more data, zero if the required
747 *    number of bytes have been copied and -errno on error.
748 *
749 *    This, together with splice_from_pipe_{begin,end,next}, may be
750 *    used to implement the functionality of __splice_from_pipe() when
751 *    locking is required around copying the pipe buffers to the
752 *    destination.
753 */
754static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
755			  splice_actor *actor)
756{
757	int ret;
758
759	while (pipe->nrbufs) {
760		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
761		const struct pipe_buf_operations *ops = buf->ops;
762
763		sd->len = buf->len;
764		if (sd->len > sd->total_len)
765			sd->len = sd->total_len;
766
767		ret = buf->ops->confirm(pipe, buf);
768		if (unlikely(ret)) {
769			if (ret == -ENODATA)
770				ret = 0;
771			return ret;
772		}
773
774		ret = actor(pipe, buf, sd);
775		if (ret <= 0)
776			return ret;
777
778		buf->offset += ret;
779		buf->len -= ret;
780
781		sd->num_spliced += ret;
782		sd->len -= ret;
783		sd->pos += ret;
784		sd->total_len -= ret;
785
786		if (!buf->len) {
787			buf->ops = NULL;
788			ops->release(pipe, buf);
789			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
790			pipe->nrbufs--;
791			if (pipe->files)
792				sd->need_wakeup = true;
793		}
794
795		if (!sd->total_len)
796			return 0;
797	}
798
799	return 1;
800}
801
802/**
803 * splice_from_pipe_next - wait for some data to splice from
804 * @pipe:	pipe to splice from
805 * @sd:		information about the splice operation
806 *
807 * Description:
808 *    This function will wait for some data and return a positive
809 *    value (one) if pipe buffers are available.  It will return zero
810 *    or -errno if no more data needs to be spliced.
811 */
812static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
813{
814	while (!pipe->nrbufs) {
815		if (!pipe->writers)
816			return 0;
817
818		if (!pipe->waiting_writers && sd->num_spliced)
819			return 0;
820
821		if (sd->flags & SPLICE_F_NONBLOCK)
822			return -EAGAIN;
823
824		if (signal_pending(current))
825			return -ERESTARTSYS;
826
827		if (sd->need_wakeup) {
828			wakeup_pipe_writers(pipe);
829			sd->need_wakeup = false;
830		}
831
832		pipe_wait(pipe);
833	}
834
835	return 1;
836}
837
838/**
839 * splice_from_pipe_begin - start splicing from pipe
840 * @sd:		information about the splice operation
841 *
842 * Description:
843 *    This function should be called before a loop containing
844 *    splice_from_pipe_next() and splice_from_pipe_feed() to
845 *    initialize the necessary fields of @sd.
846 */
847static void splice_from_pipe_begin(struct splice_desc *sd)
848{
849	sd->num_spliced = 0;
850	sd->need_wakeup = false;
851}
852
853/**
854 * splice_from_pipe_end - finish splicing from pipe
855 * @pipe:	pipe to splice from
856 * @sd:		information about the splice operation
857 *
858 * Description:
859 *    This function will wake up pipe writers if necessary.  It should
860 *    be called after a loop containing splice_from_pipe_next() and
861 *    splice_from_pipe_feed().
862 */
863static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
864{
865	if (sd->need_wakeup)
866		wakeup_pipe_writers(pipe);
867}
868
869/**
870 * __splice_from_pipe - splice data from a pipe to given actor
871 * @pipe:	pipe to splice from
872 * @sd:		information to @actor
873 * @actor:	handler that splices the data
874 *
875 * Description:
876 *    This function does little more than loop over the pipe and call
877 *    @actor to do the actual moving of a single struct pipe_buffer to
878 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
879 *    pipe_to_user.
880 *
881 */
882ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
883			   splice_actor *actor)
884{
885	int ret;
886
887	splice_from_pipe_begin(sd);
888	do {
889		ret = splice_from_pipe_next(pipe, sd);
890		if (ret > 0)
891			ret = splice_from_pipe_feed(pipe, sd, actor);
892	} while (ret > 0);
893	splice_from_pipe_end(pipe, sd);
894
895	return sd->num_spliced ? sd->num_spliced : ret;
896}
897EXPORT_SYMBOL(__splice_from_pipe);
898
899/**
900 * splice_from_pipe - splice data from a pipe to a file
901 * @pipe:	pipe to splice from
902 * @out:	file to splice to
903 * @ppos:	position in @out
904 * @len:	how many bytes to splice
905 * @flags:	splice modifier flags
906 * @actor:	handler that splices the data
907 *
908 * Description:
909 *    See __splice_from_pipe. This function locks the pipe inode,
910 *    otherwise it's identical to __splice_from_pipe().
911 *
912 */
913ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
914			 loff_t *ppos, size_t len, unsigned int flags,
915			 splice_actor *actor)
916{
917	ssize_t ret;
918	struct splice_desc sd = {
919		.total_len = len,
920		.flags = flags,
921		.pos = *ppos,
922		.u.file = out,
923	};
924
925	pipe_lock(pipe);
926	ret = __splice_from_pipe(pipe, &sd, actor);
927	pipe_unlock(pipe);
928
929	return ret;
930}
931
/**
 * iter_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *    This one is ->write_iter-based.
 *
 *    Each round gathers the queued pipe buffers into a bio_vec array,
 *    writes them with vfs_iter_write() and then retires the buffers
 *    that were consumed. *@ppos is updated after every successful write.
 *
 *    Returns the number of bytes spliced if that is non-zero, otherwise
 *    the last status: zero, or a negative error such as -ENOMEM.
 *
 */
ssize_t
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	/* One bio_vec per pipe slot; sized before the pipe lock is taken. */
	int nbufs = pipe->buffers;
	struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
					GFP_KERNEL);
	ssize_t ret;

	if (unlikely(!array))
		return -ENOMEM;

	pipe_lock(pipe);

	splice_from_pipe_begin(&sd);
	while (sd.total_len) {
		struct iov_iter from;
		size_t left;
		int n, idx;

		ret = splice_from_pipe_next(pipe, &sd);
		if (ret <= 0)
			break;

		/* The pipe has more slots than the array: reallocate it. */
		if (unlikely(nbufs < pipe->buffers)) {
			kfree(array);
			nbufs = pipe->buffers;
			array = kcalloc(nbufs, sizeof(struct bio_vec),
					GFP_KERNEL);
			if (!array) {
				ret = -ENOMEM;
				break;
			}
		}

		/* build the vector */
		left = sd.total_len;
		for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
			struct pipe_buffer *buf = pipe->bufs + idx;
			size_t this_len = buf->len;

			if (this_len > left)
				this_len = left;

			/*
			 * Ring wrap-around: at the last slot, set idx to -1
			 * so the loop's idx++ continues at slot 0.
			 */
			if (idx == pipe->buffers - 1)
				idx = -1;

			ret = buf->ops->confirm(pipe, buf);
			if (unlikely(ret)) {
				/* -ENODATA ends the splice without an error. */
				if (ret == -ENODATA)
					ret = 0;
				goto done;
			}

			array[n].bv_page = buf->page;
			array[n].bv_len = this_len;
			array[n].bv_offset = buf->offset;
			left -= this_len;
		}

		/* The iterator covers the n segments gathered above. */
		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
			      sd.total_len - left);
		ret = vfs_iter_write(out, &from, &sd.pos);
		if (ret <= 0)
			break;

		sd.num_spliced += ret;
		sd.total_len -= ret;
		*ppos = sd.pos;

		/* dismiss the fully eaten buffers, adjust the partial one */
		/* From here on ret counts the written bytes not yet accounted. */
		while (ret) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			if (ret >= buf->len) {
				const struct pipe_buf_operations *ops = buf->ops;
				ret -= buf->len;
				buf->len = 0;
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
				pipe->nrbufs--;
				if (pipe->files)
					sd.need_wakeup = true;
			} else {
				buf->offset += ret;
				buf->len -= ret;
				ret = 0;
			}
		}
	}
done:
	/* array may be NULL here if the reallocation failed. */
	kfree(array);
	splice_from_pipe_end(pipe, &sd);

	pipe_unlock(pipe);

	/* Any progress takes precedence over the last status. */
	if (sd.num_spliced)
		ret = sd.num_spliced;

	return ret;
}

EXPORT_SYMBOL(iter_file_splice_write);
1055
1056static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1057			  struct splice_desc *sd)
1058{
1059	int ret;
1060	void *data;
1061	loff_t tmp = sd->pos;
1062
1063	data = kmap(buf->page);
1064	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
1065	kunmap(buf->page);
1066
1067	return ret;
1068}
1069
1070static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
1071					 struct file *out, loff_t *ppos,
1072					 size_t len, unsigned int flags)
1073{
1074	ssize_t ret;
1075
1076	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
1077	if (ret > 0)
1078		*ppos += ret;
1079
1080	return ret;
1081}
1082
1083/**
1084 * generic_splice_sendpage - splice data from a pipe to a socket
1085 * @pipe:	pipe to splice from
1086 * @out:	socket to write to
1087 * @ppos:	position in @out
1088 * @len:	number of bytes to splice
1089 * @flags:	splice modifier flags
1090 *
1091 * Description:
1092 *    Will send @len bytes from the pipe to a network socket. No data copying
1093 *    is involved.
1094 *
1095 */
1096ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
1097				loff_t *ppos, size_t len, unsigned int flags)
1098{
1099	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
1100}
1101
1102EXPORT_SYMBOL(generic_splice_sendpage);
1103
1104/*
1105 * Attempt to initiate a splice from pipe to file.
1106 */
1107static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1108			   loff_t *ppos, size_t len, unsigned int flags)
1109{
1110	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1111				loff_t *, size_t, unsigned int);
1112
1113	if (out->f_op->splice_write)
1114		splice_write = out->f_op->splice_write;
1115	else
1116		splice_write = default_file_splice_write;
1117
1118	return splice_write(pipe, out, ppos, len, flags);
1119}
1120
1121/*
1122 * Attempt to initiate a splice from a file to a pipe.
1123 */
1124static long do_splice_to(struct file *in, loff_t *ppos,
1125			 struct pipe_inode_info *pipe, size_t len,
1126			 unsigned int flags)
1127{
1128	ssize_t (*splice_read)(struct file *, loff_t *,
1129			       struct pipe_inode_info *, size_t, unsigned int);
1130	int ret;
1131
1132	if (unlikely(!(in->f_mode & FMODE_READ)))
1133		return -EBADF;
1134
1135	ret = rw_verify_area(READ, in, ppos, len);
1136	if (unlikely(ret < 0))
1137		return ret;
1138
1139	if (in->f_op->splice_read)
1140		splice_read = in->f_op->splice_read;
1141	else
1142		splice_read = default_file_splice_read;
1143
1144	return splice_read(in, ppos, pipe, len, flags);
1145}
1146
1147/**
1148 * splice_direct_to_actor - splices data directly between two non-pipes
1149 * @in:		file to splice from
1150 * @sd:		actor information on where to splice to
1151 * @actor:	handles the data splicing
1152 *
1153 * Description:
1154 *    This is a special case helper to splice directly between two
1155 *    points, without requiring an explicit pipe. Internally an allocated
1156 *    pipe is cached in the process, and reused during the lifetime of
1157 *    that process.
1158 *
1159 */
1160ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1161			       splice_direct_actor *actor)
1162{
1163	struct pipe_inode_info *pipe;
1164	long ret, bytes;
1165	umode_t i_mode;
1166	size_t len;
1167	int i, flags, more;
1168
1169	/*
1170	 * We require the input being a regular file, as we don't want to
1171	 * randomly drop data for eg socket -> socket splicing. Use the
1172	 * piped splicing for that!
1173	 */
1174	i_mode = file_inode(in)->i_mode;
1175	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
1176		return -EINVAL;
1177
1178	/*
1179	 * neither in nor out is a pipe, setup an internal pipe attached to
1180	 * 'out' and transfer the wanted data from 'in' to 'out' through that
1181	 */
1182	pipe = current->splice_pipe;
1183	if (unlikely(!pipe)) {
1184		pipe = alloc_pipe_info();
1185		if (!pipe)
1186			return -ENOMEM;
1187
1188		/*
1189		 * We don't have an immediate reader, but we'll read the stuff
1190		 * out of the pipe right after the splice_to_pipe(). So set
1191		 * PIPE_READERS appropriately.
1192		 */
1193		pipe->readers = 1;
1194
1195		current->splice_pipe = pipe;
1196	}
1197
1198	/*
1199	 * Do the splice.
1200	 */
1201	ret = 0;
1202	bytes = 0;
1203	len = sd->total_len;
1204	flags = sd->flags;
1205
1206	/*
1207	 * Don't block on output, we have to drain the direct pipe.
1208	 */
1209	sd->flags &= ~SPLICE_F_NONBLOCK;
1210	more = sd->flags & SPLICE_F_MORE;
1211
1212	while (len) {
1213		size_t read_len;
1214		loff_t pos = sd->pos, prev_pos = pos;
1215
1216		ret = do_splice_to(in, &pos, pipe, len, flags);
1217		if (unlikely(ret <= 0))
1218			goto out_release;
1219
1220		read_len = ret;
1221		sd->total_len = read_len;
1222
1223		/*
1224		 * If more data is pending, set SPLICE_F_MORE
1225		 * If this is the last data and SPLICE_F_MORE was not set
1226		 * initially, clears it.
1227		 */
1228		if (read_len < len)
1229			sd->flags |= SPLICE_F_MORE;
1230		else if (!more)
1231			sd->flags &= ~SPLICE_F_MORE;
1232		/*
1233		 * NOTE: nonblocking mode only applies to the input. We
1234		 * must not do the output in nonblocking mode as then we
1235		 * could get stuck data in the internal pipe:
1236		 */
1237		ret = actor(pipe, sd);
1238		if (unlikely(ret <= 0)) {
1239			sd->pos = prev_pos;
1240			goto out_release;
1241		}
1242
1243		bytes += ret;
1244		len -= ret;
1245		sd->pos = pos;
1246
1247		if (ret < read_len) {
1248			sd->pos = prev_pos + ret;
1249			goto out_release;
1250		}
1251	}
1252
1253done:
1254	pipe->nrbufs = pipe->curbuf = 0;
1255	file_accessed(in);
1256	return bytes;
1257
1258out_release:
1259	/*
1260	 * If we did an incomplete transfer we must release
1261	 * the pipe buffers in question:
1262	 */
1263	for (i = 0; i < pipe->buffers; i++) {
1264		struct pipe_buffer *buf = pipe->bufs + i;
1265
1266		if (buf->ops) {
1267			buf->ops->release(pipe, buf);
1268			buf->ops = NULL;
1269		}
1270	}
1271
1272	if (!bytes)
1273		bytes = ret;
1274
1275	goto done;
1276}
1277EXPORT_SYMBOL(splice_direct_to_actor);
1278
1279static int direct_splice_actor(struct pipe_inode_info *pipe,
1280			       struct splice_desc *sd)
1281{
1282	struct file *file = sd->u.file;
1283
1284	return do_splice_from(pipe, file, sd->opos, sd->total_len,
1285			      sd->flags);
1286}
1287
1288/**
1289 * do_splice_direct - splices data directly between two files
1290 * @in:		file to splice from
1291 * @ppos:	input file offset
1292 * @out:	file to splice to
1293 * @opos:	output file offset
1294 * @len:	number of bytes to splice
1295 * @flags:	splice modifier flags
1296 *
1297 * Description:
1298 *    For use by do_sendfile(). splice can easily emulate sendfile, but
1299 *    doing it in the application would incur an extra system call
1300 *    (splice in + splice out, as compared to just sendfile()). So this helper
1301 *    can splice directly through a process-private pipe.
1302 *
1303 */
1304long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1305		      loff_t *opos, size_t len, unsigned int flags)
1306{
1307	struct splice_desc sd = {
1308		.len		= len,
1309		.total_len	= len,
1310		.flags		= flags,
1311		.pos		= *ppos,
1312		.u.file		= out,
1313		.opos		= opos,
1314	};
1315	long ret;
1316
1317	if (unlikely(!(out->f_mode & FMODE_WRITE)))
1318		return -EBADF;
1319
1320	if (unlikely(out->f_flags & O_APPEND))
1321		return -EINVAL;
1322
1323	ret = rw_verify_area(WRITE, out, opos, len);
1324	if (unlikely(ret < 0))
1325		return ret;
1326
1327	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1328	if (ret > 0)
1329		*ppos = sd.pos;
1330
1331	return ret;
1332}
1333EXPORT_SYMBOL(do_splice_direct);
1334
1335static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1336			       struct pipe_inode_info *opipe,
1337			       size_t len, unsigned int flags);
1338
1339/*
1340 * Determine where to splice to/from.
1341 */
1342static long do_splice(struct file *in, loff_t __user *off_in,
1343		      struct file *out, loff_t __user *off_out,
1344		      size_t len, unsigned int flags)
1345{
1346	struct pipe_inode_info *ipipe;
1347	struct pipe_inode_info *opipe;
1348	loff_t offset;
1349	long ret;
1350
1351	ipipe = get_pipe_info(in);
1352	opipe = get_pipe_info(out);
1353
1354	if (ipipe && opipe) {
1355		if (off_in || off_out)
1356			return -ESPIPE;
1357
1358		if (!(in->f_mode & FMODE_READ))
1359			return -EBADF;
1360
1361		if (!(out->f_mode & FMODE_WRITE))
1362			return -EBADF;
1363
1364		/* Splicing to self would be fun, but... */
1365		if (ipipe == opipe)
1366			return -EINVAL;
1367
1368		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1369	}
1370
1371	if (ipipe) {
1372		if (off_in)
1373			return -ESPIPE;
1374		if (off_out) {
1375			if (!(out->f_mode & FMODE_PWRITE))
1376				return -EINVAL;
1377			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1378				return -EFAULT;
1379		} else {
1380			offset = out->f_pos;
1381		}
1382
1383		if (unlikely(!(out->f_mode & FMODE_WRITE)))
1384			return -EBADF;
1385
1386		if (unlikely(out->f_flags & O_APPEND))
1387			return -EINVAL;
1388
1389		ret = rw_verify_area(WRITE, out, &offset, len);
1390		if (unlikely(ret < 0))
1391			return ret;
1392
1393		file_start_write(out);
1394		ret = do_splice_from(ipipe, out, &offset, len, flags);
1395		file_end_write(out);
1396
1397		if (!off_out)
1398			out->f_pos = offset;
1399		else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1400			ret = -EFAULT;
1401
1402		return ret;
1403	}
1404
1405	if (opipe) {
1406		if (off_out)
1407			return -ESPIPE;
1408		if (off_in) {
1409			if (!(in->f_mode & FMODE_PREAD))
1410				return -EINVAL;
1411			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1412				return -EFAULT;
1413		} else {
1414			offset = in->f_pos;
1415		}
1416
1417		ret = do_splice_to(in, &offset, opipe, len, flags);
1418
1419		if (!off_in)
1420			in->f_pos = offset;
1421		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1422			ret = -EFAULT;
1423
1424		return ret;
1425	}
1426
1427	return -EINVAL;
1428}
1429
1430/*
1431 * Map an iov into an array of pages and offset/length tupples. With the
1432 * partial_page structure, we can map several non-contiguous ranges into
1433 * our ones pages[] map instead of splitting that operation into pieces.
1434 * Could easily be exported as a generic helper for other users, in which
1435 * case one would probably want to add a 'max_nr_pages' parameter as well.
1436 */
1437static int get_iovec_page_array(const struct iovec __user *iov,
1438				unsigned int nr_vecs, struct page **pages,
1439				struct partial_page *partial, bool aligned,
1440				unsigned int pipe_buffers)
1441{
1442	int buffers = 0, error = 0;
1443
1444	while (nr_vecs) {
1445		unsigned long off, npages;
1446		struct iovec entry;
1447		void __user *base;
1448		size_t len;
1449		int i;
1450
1451		error = -EFAULT;
1452		if (copy_from_user(&entry, iov, sizeof(entry)))
1453			break;
1454
1455		base = entry.iov_base;
1456		len = entry.iov_len;
1457
1458		/*
1459		 * Sanity check this iovec. 0 read succeeds.
1460		 */
1461		error = 0;
1462		if (unlikely(!len))
1463			break;
1464		error = -EFAULT;
1465		if (!access_ok(VERIFY_READ, base, len))
1466			break;
1467
1468		/*
1469		 * Get this base offset and number of pages, then map
1470		 * in the user pages.
1471		 */
1472		off = (unsigned long) base & ~PAGE_MASK;
1473
1474		/*
1475		 * If asked for alignment, the offset must be zero and the
1476		 * length a multiple of the PAGE_SIZE.
1477		 */
1478		error = -EINVAL;
1479		if (aligned && (off || len & ~PAGE_MASK))
1480			break;
1481
1482		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1483		if (npages > pipe_buffers - buffers)
1484			npages = pipe_buffers - buffers;
1485
1486		error = get_user_pages_fast((unsigned long)base, npages,
1487					0, &pages[buffers]);
1488
1489		if (unlikely(error <= 0))
1490			break;
1491
1492		/*
1493		 * Fill this contiguous range into the partial page map.
1494		 */
1495		for (i = 0; i < error; i++) {
1496			const int plen = min_t(size_t, len, PAGE_SIZE - off);
1497
1498			partial[buffers].offset = off;
1499			partial[buffers].len = plen;
1500
1501			off = 0;
1502			len -= plen;
1503			buffers++;
1504		}
1505
1506		/*
1507		 * We didn't complete this iov, stop here since it probably
1508		 * means we have to move some of this into a pipe to
1509		 * be able to continue.
1510		 */
1511		if (len)
1512			break;
1513
1514		/*
1515		 * Don't continue if we mapped fewer pages than we asked for,
1516		 * or if we mapped the max number of pages that we have
1517		 * room for.
1518		 */
1519		if (error < npages || buffers == pipe_buffers)
1520			break;
1521
1522		nr_vecs--;
1523		iov++;
1524	}
1525
1526	if (buffers)
1527		return buffers;
1528
1529	return error;
1530}
1531
1532static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1533			struct splice_desc *sd)
1534{
1535	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1536	return n == sd->len ? n : -EFAULT;
1537}
1538
1539/*
1540 * For lack of a better implementation, implement vmsplice() to userspace
1541 * as a simple copy of the pipes pages to the user iov.
1542 */
1543static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1544			     unsigned long nr_segs, unsigned int flags)
1545{
1546	struct pipe_inode_info *pipe;
1547	struct splice_desc sd;
1548	long ret;
1549	struct iovec iovstack[UIO_FASTIOV];
1550	struct iovec *iov = iovstack;
1551	struct iov_iter iter;
1552
1553	pipe = get_pipe_info(file);
1554	if (!pipe)
1555		return -EBADF;
1556
1557	ret = import_iovec(READ, uiov, nr_segs,
1558			   ARRAY_SIZE(iovstack), &iov, &iter);
1559	if (ret < 0)
1560		return ret;
1561
1562	sd.total_len = iov_iter_count(&iter);
1563	sd.len = 0;
1564	sd.flags = flags;
1565	sd.u.data = &iter;
1566	sd.pos = 0;
1567
1568	if (sd.total_len) {
1569		pipe_lock(pipe);
1570		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1571		pipe_unlock(pipe);
1572	}
1573
1574	kfree(iov);
1575	return ret;
1576}
1577
1578/*
1579 * vmsplice splices a user address range into a pipe. It can be thought of
1580 * as splice-from-memory, where the regular splice is splice-from-file (or
1581 * to file). In both cases the output is a pipe, naturally.
1582 */
1583static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1584			     unsigned long nr_segs, unsigned int flags)
1585{
1586	struct pipe_inode_info *pipe;
1587	struct page *pages[PIPE_DEF_BUFFERS];
1588	struct partial_page partial[PIPE_DEF_BUFFERS];
1589	struct splice_pipe_desc spd = {
1590		.pages = pages,
1591		.partial = partial,
1592		.nr_pages_max = PIPE_DEF_BUFFERS,
1593		.flags = flags,
1594		.ops = &user_page_pipe_buf_ops,
1595		.spd_release = spd_release_page,
1596	};
1597	long ret;
1598
1599	pipe = get_pipe_info(file);
1600	if (!pipe)
1601		return -EBADF;
1602
1603	if (splice_grow_spd(pipe, &spd))
1604		return -ENOMEM;
1605
1606	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
1607					    spd.partial, false,
1608					    spd.nr_pages_max);
1609	if (spd.nr_pages <= 0)
1610		ret = spd.nr_pages;
1611	else
1612		ret = splice_to_pipe(pipe, &spd);
1613
1614	splice_shrink_spd(&spd);
1615	return ret;
1616}
1617
1618/*
1619 * Note that vmsplice only really supports true splicing _from_ user memory
1620 * to a pipe, not the other way around. Splicing from user memory is a simple
1621 * operation that can be supported without any funky alignment restrictions
1622 * or nasty vm tricks. We simply map in the user memory and fill them into
1623 * a pipe. The reverse isn't quite as easy, though. There are two possible
1624 * solutions for that:
1625 *
1626 *	- memcpy() the data internally, at which point we might as well just
1627 *	  do a regular read() on the buffer anyway.
1628 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
1629 *	  has restriction limitations on both ends of the pipe).
1630 *
1631 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1632 *
1633 */
1634SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1635		unsigned long, nr_segs, unsigned int, flags)
1636{
1637	struct fd f;
1638	long error;
1639
1640	if (unlikely(nr_segs > UIO_MAXIOV))
1641		return -EINVAL;
1642	else if (unlikely(!nr_segs))
1643		return 0;
1644
1645	error = -EBADF;
1646	f = fdget(fd);
1647	if (f.file) {
1648		if (f.file->f_mode & FMODE_WRITE)
1649			error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
1650		else if (f.file->f_mode & FMODE_READ)
1651			error = vmsplice_to_user(f.file, iov, nr_segs, flags);
1652
1653		fdput(f);
1654	}
1655
1656	return error;
1657}
1658
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for vmsplice: convert each compat_iovec into a
 * native iovec placed in an area obtained from compat_alloc_user_space(),
 * then call the native syscall on the converted array.
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
		    unsigned int, nr_segs, unsigned int, flags)
{
	struct iovec __user *iov;
	unsigned seg;

	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));

	for (seg = 0; seg < nr_segs; seg++) {
		struct compat_iovec v;

		if (get_user(v.iov_base, &iov32[seg].iov_base))
			return -EFAULT;
		if (get_user(v.iov_len, &iov32[seg].iov_len))
			return -EFAULT;
		if (put_user(compat_ptr(v.iov_base), &iov[seg].iov_base))
			return -EFAULT;
		if (put_user(v.iov_len, &iov[seg].iov_len))
			return -EFAULT;
	}

	return sys_vmsplice(fd, iov, nr_segs, flags);
}
#endif
1679
1680SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1681		int, fd_out, loff_t __user *, off_out,
1682		size_t, len, unsigned int, flags)
1683{
1684	struct fd in, out;
1685	long error;
1686
1687	if (unlikely(!len))
1688		return 0;
1689
1690	error = -EBADF;
1691	in = fdget(fd_in);
1692	if (in.file) {
1693		if (in.file->f_mode & FMODE_READ) {
1694			out = fdget(fd_out);
1695			if (out.file) {
1696				if (out.file->f_mode & FMODE_WRITE)
1697					error = do_splice(in.file, off_in,
1698							  out.file, off_out,
1699							  len, flags);
1700				fdput(out);
1701			}
1702		}
1703		fdput(in);
1704	}
1705	return error;
1706}
1707
1708/*
1709 * Make sure there's data to read. Wait for input if we can, otherwise
1710 * return an appropriate error.
1711 */
1712static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1713{
1714	int ret;
1715
1716	/*
1717	 * Check ->nrbufs without the inode lock first. This function
1718	 * is speculative anyways, so missing one is ok.
1719	 */
1720	if (pipe->nrbufs)
1721		return 0;
1722
1723	ret = 0;
1724	pipe_lock(pipe);
1725
1726	while (!pipe->nrbufs) {
1727		if (signal_pending(current)) {
1728			ret = -ERESTARTSYS;
1729			break;
1730		}
1731		if (!pipe->writers)
1732			break;
1733		if (!pipe->waiting_writers) {
1734			if (flags & SPLICE_F_NONBLOCK) {
1735				ret = -EAGAIN;
1736				break;
1737			}
1738		}
1739		pipe_wait(pipe);
1740	}
1741
1742	pipe_unlock(pipe);
1743	return ret;
1744}
1745
1746/*
1747 * Make sure there's writeable room. Wait for room if we can, otherwise
1748 * return an appropriate error.
1749 */
1750static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1751{
1752	int ret;
1753
1754	/*
1755	 * Check ->nrbufs without the inode lock first. This function
1756	 * is speculative anyways, so missing one is ok.
1757	 */
1758	if (pipe->nrbufs < pipe->buffers)
1759		return 0;
1760
1761	ret = 0;
1762	pipe_lock(pipe);
1763
1764	while (pipe->nrbufs >= pipe->buffers) {
1765		if (!pipe->readers) {
1766			send_sig(SIGPIPE, current, 0);
1767			ret = -EPIPE;
1768			break;
1769		}
1770		if (flags & SPLICE_F_NONBLOCK) {
1771			ret = -EAGAIN;
1772			break;
1773		}
1774		if (signal_pending(current)) {
1775			ret = -ERESTARTSYS;
1776			break;
1777		}
1778		pipe->waiting_writers++;
1779		pipe_wait(pipe);
1780		pipe->waiting_writers--;
1781	}
1782
1783	pipe_unlock(pipe);
1784	return ret;
1785}
1786
1787/*
1788 * Splice contents of ipipe to opipe.
1789 */
1790static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1791			       struct pipe_inode_info *opipe,
1792			       size_t len, unsigned int flags)
1793{
1794	struct pipe_buffer *ibuf, *obuf;
1795	int ret = 0, nbuf;
1796	bool input_wakeup = false;
1797
1798
1799retry:
1800	ret = ipipe_prep(ipipe, flags);
1801	if (ret)
1802		return ret;
1803
1804	ret = opipe_prep(opipe, flags);
1805	if (ret)
1806		return ret;
1807
1808	/*
1809	 * Potential ABBA deadlock, work around it by ordering lock
1810	 * grabbing by pipe info address. Otherwise two different processes
1811	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1812	 */
1813	pipe_double_lock(ipipe, opipe);
1814
1815	do {
1816		if (!opipe->readers) {
1817			send_sig(SIGPIPE, current, 0);
1818			if (!ret)
1819				ret = -EPIPE;
1820			break;
1821		}
1822
1823		if (!ipipe->nrbufs && !ipipe->writers)
1824			break;
1825
1826		/*
1827		 * Cannot make any progress, because either the input
1828		 * pipe is empty or the output pipe is full.
1829		 */
1830		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1831			/* Already processed some buffers, break */
1832			if (ret)
1833				break;
1834
1835			if (flags & SPLICE_F_NONBLOCK) {
1836				ret = -EAGAIN;
1837				break;
1838			}
1839
1840			/*
1841			 * We raced with another reader/writer and haven't
1842			 * managed to process any buffers.  A zero return
1843			 * value means EOF, so retry instead.
1844			 */
1845			pipe_unlock(ipipe);
1846			pipe_unlock(opipe);
1847			goto retry;
1848		}
1849
1850		ibuf = ipipe->bufs + ipipe->curbuf;
1851		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1852		obuf = opipe->bufs + nbuf;
1853
1854		if (len >= ibuf->len) {
1855			/*
1856			 * Simply move the whole buffer from ipipe to opipe
1857			 */
1858			*obuf = *ibuf;
1859			ibuf->ops = NULL;
1860			opipe->nrbufs++;
1861			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1862			ipipe->nrbufs--;
1863			input_wakeup = true;
1864		} else {
1865			/*
1866			 * Get a reference to this pipe buffer,
1867			 * so we can copy the contents over.
1868			 */
1869			ibuf->ops->get(ipipe, ibuf);
1870			*obuf = *ibuf;
1871
1872			/*
1873			 * Don't inherit the gift flag, we need to
1874			 * prevent multiple steals of this page.
1875			 */
1876			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1877
1878			obuf->len = len;
1879			opipe->nrbufs++;
1880			ibuf->offset += obuf->len;
1881			ibuf->len -= obuf->len;
1882		}
1883		ret += obuf->len;
1884		len -= obuf->len;
1885	} while (len);
1886
1887	pipe_unlock(ipipe);
1888	pipe_unlock(opipe);
1889
1890	/*
1891	 * If we put data in the output pipe, wakeup any potential readers.
1892	 */
1893	if (ret > 0)
1894		wakeup_pipe_readers(opipe);
1895
1896	if (input_wakeup)
1897		wakeup_pipe_writers(ipipe);
1898
1899	return ret;
1900}
1901
1902/*
1903 * Link contents of ipipe to opipe.
1904 */
1905static int link_pipe(struct pipe_inode_info *ipipe,
1906		     struct pipe_inode_info *opipe,
1907		     size_t len, unsigned int flags)
1908{
1909	struct pipe_buffer *ibuf, *obuf;
1910	int ret = 0, i = 0, nbuf;
1911
1912	/*
1913	 * Potential ABBA deadlock, work around it by ordering lock
1914	 * grabbing by pipe info address. Otherwise two different processes
1915	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1916	 */
1917	pipe_double_lock(ipipe, opipe);
1918
1919	do {
1920		if (!opipe->readers) {
1921			send_sig(SIGPIPE, current, 0);
1922			if (!ret)
1923				ret = -EPIPE;
1924			break;
1925		}
1926
1927		/*
1928		 * If we have iterated all input buffers or ran out of
1929		 * output room, break.
1930		 */
1931		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1932			break;
1933
1934		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1935		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1936
1937		/*
1938		 * Get a reference to this pipe buffer,
1939		 * so we can copy the contents over.
1940		 */
1941		ibuf->ops->get(ipipe, ibuf);
1942
1943		obuf = opipe->bufs + nbuf;
1944		*obuf = *ibuf;
1945
1946		/*
1947		 * Don't inherit the gift flag, we need to
1948		 * prevent multiple steals of this page.
1949		 */
1950		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1951
1952		if (obuf->len > len)
1953			obuf->len = len;
1954
1955		opipe->nrbufs++;
1956		ret += obuf->len;
1957		len -= obuf->len;
1958		i++;
1959	} while (len);
1960
1961	/*
1962	 * return EAGAIN if we have the potential of some data in the
1963	 * future, otherwise just return 0
1964	 */
1965	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1966		ret = -EAGAIN;
1967
1968	pipe_unlock(ipipe);
1969	pipe_unlock(opipe);
1970
1971	/*
1972	 * If we put data in the output pipe, wakeup any potential readers.
1973	 */
1974	if (ret > 0)
1975		wakeup_pipe_readers(opipe);
1976
1977	return ret;
1978}
1979
1980/*
1981 * This is a tee(1) implementation that works on pipes. It doesn't copy
1982 * any data, it simply references the 'in' pages on the 'out' pipe.
1983 * The 'flags' used are the SPLICE_F_* variants, currently the only
1984 * applicable one is SPLICE_F_NONBLOCK.
1985 */
1986static long do_tee(struct file *in, struct file *out, size_t len,
1987		   unsigned int flags)
1988{
1989	struct pipe_inode_info *ipipe = get_pipe_info(in);
1990	struct pipe_inode_info *opipe = get_pipe_info(out);
1991	int ret = -EINVAL;
1992
1993	/*
1994	 * Duplicate the contents of ipipe to opipe without actually
1995	 * copying the data.
1996	 */
1997	if (ipipe && opipe && ipipe != opipe) {
1998		/*
1999		 * Keep going, unless we encounter an error. The ipipe/opipe
2000		 * ordering doesn't really matter.
2001		 */
2002		ret = ipipe_prep(ipipe, flags);
2003		if (!ret) {
2004			ret = opipe_prep(opipe, flags);
2005			if (!ret)
2006				ret = link_pipe(ipipe, opipe, len, flags);
2007		}
2008	}
2009
2010	return ret;
2011}
2012
2013SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
2014{
2015	struct fd in;
2016	int error;
2017
2018	if (unlikely(!len))
2019		return 0;
2020
2021	error = -EBADF;
2022	in = fdget(fdin);
2023	if (in.file) {
2024		if (in.file->f_mode & FMODE_READ) {
2025			struct fd out = fdget(fdout);
2026			if (out.file) {
2027				if (out.file->f_mode & FMODE_WRITE)
2028					error = do_tee(in.file, out.file,
2029							len, flags);
2030				fdput(out);
2031			}
2032		}
2033 		fdput(in);
2034 	}
2035
2036	return error;
2037}
2038