root/fs/pipe.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. pipe_lock_nested
  2. pipe_lock
  3. pipe_unlock
  4. __pipe_lock
  5. __pipe_unlock
  6. pipe_double_lock
  7. pipe_wait
  8. anon_pipe_buf_release
  9. anon_pipe_buf_steal
  10. generic_pipe_buf_steal
  11. generic_pipe_buf_get
  12. generic_pipe_buf_confirm
  13. generic_pipe_buf_release
  14. pipe_buf_mark_unmergeable
  15. pipe_buf_can_merge
  16. pipe_read
  17. is_packetized
  18. pipe_write
  19. pipe_ioctl
  20. pipe_poll
  21. put_pipe_info
  22. pipe_release
  23. pipe_fasync
  24. account_pipe_buffers
  25. too_many_pipe_buffers_soft
  26. too_many_pipe_buffers_hard
  27. is_unprivileged_user
  28. alloc_pipe_info
  29. free_pipe_info
  30. pipefs_dname
  31. get_pipe_inode
  32. create_pipe_files
  33. __do_pipe_flags
  34. do_pipe_flags
  35. do_pipe2
  36. SYSCALL_DEFINE2
  37. SYSCALL_DEFINE1
  38. wait_for_partner
  39. wake_up_partner
  40. fifo_open
  41. round_pipe_size
  42. pipe_set_size
  43. get_pipe_info
  44. pipe_fcntl
  45. pipefs_init_fs_context
  46. init_pipe_fs

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  linux/fs/pipe.c
   4  *
   5  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
   6  */
   7 
   8 #include <linux/mm.h>
   9 #include <linux/file.h>
  10 #include <linux/poll.h>
  11 #include <linux/slab.h>
  12 #include <linux/module.h>
  13 #include <linux/init.h>
  14 #include <linux/fs.h>
  15 #include <linux/log2.h>
  16 #include <linux/mount.h>
  17 #include <linux/pseudo_fs.h>
  18 #include <linux/magic.h>
  19 #include <linux/pipe_fs_i.h>
  20 #include <linux/uio.h>
  21 #include <linux/highmem.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/audit.h>
  24 #include <linux/syscalls.h>
  25 #include <linux/fcntl.h>
  26 #include <linux/memcontrol.h>
  27 
  28 #include <linux/uaccess.h>
  29 #include <asm/ioctls.h>
  30 
  31 #include "internal.h"
  32 
  33 /*
  34  * The max size that a non-root user is allowed to grow the pipe. Can
  35  * be set by root in /proc/sys/fs/pipe-max-size
  36  */
  37 unsigned int pipe_max_size = 1048576;
  38 
  39 /* Maximum allocatable pages per user. Hard limit is unset by default, soft
  40  * matches default values.
  41  */
  42 unsigned long pipe_user_pages_hard;
  43 unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
  44 
  45 /*
  46  * We use a start+len construction, which provides full use of the 
  47  * allocated memory.
  48  * -- Florian Coosmann (FGC)
  49  * 
  50  * Reads with count = 0 should always return 0.
  51  * -- Julian Bradfield 1999-06-07.
  52  *
  53  * FIFOs and Pipes now generate SIGIO for both readers and writers.
  54  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
  55  *
  56  * pipe_read & write cleanup
  57  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
  58  */
  59 
  60 static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
  61 {
  62         if (pipe->files)
  63                 mutex_lock_nested(&pipe->mutex, subclass);
  64 }
  65 
  66 void pipe_lock(struct pipe_inode_info *pipe)
  67 {
  68         /*
  69          * pipe_lock() nests non-pipe inode locks (for writing to a file)
  70          */
  71         pipe_lock_nested(pipe, I_MUTEX_PARENT);
  72 }
  73 EXPORT_SYMBOL(pipe_lock);
  74 
  75 void pipe_unlock(struct pipe_inode_info *pipe)
  76 {
  77         if (pipe->files)
  78                 mutex_unlock(&pipe->mutex);
  79 }
  80 EXPORT_SYMBOL(pipe_unlock);
  81 
  82 static inline void __pipe_lock(struct pipe_inode_info *pipe)
  83 {
  84         mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
  85 }
  86 
  87 static inline void __pipe_unlock(struct pipe_inode_info *pipe)
  88 {
  89         mutex_unlock(&pipe->mutex);
  90 }
  91 
  92 void pipe_double_lock(struct pipe_inode_info *pipe1,
  93                       struct pipe_inode_info *pipe2)
  94 {
  95         BUG_ON(pipe1 == pipe2);
  96 
  97         if (pipe1 < pipe2) {
  98                 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
  99                 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
 100         } else {
 101                 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
 102                 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
 103         }
 104 }
 105 
 106 /* Drop the inode semaphore and wait for a pipe event, atomically */
 107 void pipe_wait(struct pipe_inode_info *pipe)
 108 {
 109         DEFINE_WAIT(wait);
 110 
 111         /*
 112          * Pipes are system-local resources, so sleeping on them
 113          * is considered a noninteractive wait:
 114          */
 115         prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
 116         pipe_unlock(pipe);
 117         schedule();
 118         finish_wait(&pipe->wait, &wait);
 119         pipe_lock(pipe);
 120 }
 121 
 122 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 123                                   struct pipe_buffer *buf)
 124 {
 125         struct page *page = buf->page;
 126 
 127         /*
 128          * If nobody else uses this page, and we don't already have a
 129          * temporary page, let's keep track of it as a one-deep
 130          * allocation cache. (Otherwise just release our reference to it)
 131          */
 132         if (page_count(page) == 1 && !pipe->tmp_page)
 133                 pipe->tmp_page = page;
 134         else
 135                 put_page(page);
 136 }
 137 
 138 static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
 139                                struct pipe_buffer *buf)
 140 {
 141         struct page *page = buf->page;
 142 
 143         if (page_count(page) == 1) {
 144                 memcg_kmem_uncharge(page, 0);
 145                 __SetPageLocked(page);
 146                 return 0;
 147         }
 148         return 1;
 149 }
 150 
 151 /**
 152  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 153  * @pipe:       the pipe that the buffer belongs to
 154  * @buf:        the buffer to attempt to steal
 155  *
 156  * Description:
 157  *      This function attempts to steal the &struct page attached to
 158  *      @buf. If successful, this function returns 0 and returns with
 159  *      the page locked. The caller may then reuse the page for whatever
 160  *      he wishes; the typical use is insertion into a different file
 161  *      page cache.
 162  */
 163 int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 164                            struct pipe_buffer *buf)
 165 {
 166         struct page *page = buf->page;
 167 
 168         /*
 169          * A reference of one is golden, that means that the owner of this
 170          * page is the only one holding a reference to it. lock the page
 171          * and return OK.
 172          */
 173         if (page_count(page) == 1) {
 174                 lock_page(page);
 175                 return 0;
 176         }
 177 
 178         return 1;
 179 }
 180 EXPORT_SYMBOL(generic_pipe_buf_steal);
 181 
 182 /**
 183  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 184  * @pipe:       the pipe that the buffer belongs to
 185  * @buf:        the buffer to get a reference to
 186  *
 187  * Description:
 188  *      This function grabs an extra reference to @buf. It's used in
 189  *      in the tee() system call, when we duplicate the buffers in one
 190  *      pipe into another.
 191  */
 192 bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 193 {
 194         return try_get_page(buf->page);
 195 }
 196 EXPORT_SYMBOL(generic_pipe_buf_get);
 197 
 198 /**
 199  * generic_pipe_buf_confirm - verify contents of the pipe buffer
 200  * @info:       the pipe that the buffer belongs to
 201  * @buf:        the buffer to confirm
 202  *
 203  * Description:
 204  *      This function does nothing, because the generic pipe code uses
 205  *      pages that are always good when inserted into the pipe.
 206  */
 207 int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 208                              struct pipe_buffer *buf)
 209 {
 210         return 0;
 211 }
 212 EXPORT_SYMBOL(generic_pipe_buf_confirm);
 213 
 214 /**
 215  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 216  * @pipe:       the pipe that the buffer belongs to
 217  * @buf:        the buffer to put a reference to
 218  *
 219  * Description:
 220  *      This function releases a reference to @buf.
 221  */
 222 void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 223                               struct pipe_buffer *buf)
 224 {
 225         put_page(buf->page);
 226 }
 227 EXPORT_SYMBOL(generic_pipe_buf_release);
 228 
 229 /* New data written to a pipe may be appended to a buffer with this type. */
 230 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 231         .confirm = generic_pipe_buf_confirm,
 232         .release = anon_pipe_buf_release,
 233         .steal = anon_pipe_buf_steal,
 234         .get = generic_pipe_buf_get,
 235 };
 236 
 237 static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
 238         .confirm = generic_pipe_buf_confirm,
 239         .release = anon_pipe_buf_release,
 240         .steal = anon_pipe_buf_steal,
 241         .get = generic_pipe_buf_get,
 242 };
 243 
 244 static const struct pipe_buf_operations packet_pipe_buf_ops = {
 245         .confirm = generic_pipe_buf_confirm,
 246         .release = anon_pipe_buf_release,
 247         .steal = anon_pipe_buf_steal,
 248         .get = generic_pipe_buf_get,
 249 };
 250 
 251 /**
 252  * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
 253  * @buf:        the buffer to mark
 254  *
 255  * Description:
 256  *      This function ensures that no future writes will be merged into the
 257  *      given &struct pipe_buffer. This is necessary when multiple pipe buffers
 258  *      share the same backing page.
 259  */
 260 void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
 261 {
 262         if (buf->ops == &anon_pipe_buf_ops)
 263                 buf->ops = &anon_pipe_buf_nomerge_ops;
 264 }
 265 
 266 static bool pipe_buf_can_merge(struct pipe_buffer *buf)
 267 {
 268         return buf->ops == &anon_pipe_buf_ops;
 269 }
 270 
 271 static ssize_t
 272 pipe_read(struct kiocb *iocb, struct iov_iter *to)
 273 {
 274         size_t total_len = iov_iter_count(to);
 275         struct file *filp = iocb->ki_filp;
 276         struct pipe_inode_info *pipe = filp->private_data;
 277         int do_wakeup;
 278         ssize_t ret;
 279 
 280         /* Null read succeeds. */
 281         if (unlikely(total_len == 0))
 282                 return 0;
 283 
 284         do_wakeup = 0;
 285         ret = 0;
 286         __pipe_lock(pipe);
 287         for (;;) {
 288                 int bufs = pipe->nrbufs;
 289                 if (bufs) {
 290                         int curbuf = pipe->curbuf;
 291                         struct pipe_buffer *buf = pipe->bufs + curbuf;
 292                         size_t chars = buf->len;
 293                         size_t written;
 294                         int error;
 295 
 296                         if (chars > total_len)
 297                                 chars = total_len;
 298 
 299                         error = pipe_buf_confirm(pipe, buf);
 300                         if (error) {
 301                                 if (!ret)
 302                                         ret = error;
 303                                 break;
 304                         }
 305 
 306                         written = copy_page_to_iter(buf->page, buf->offset, chars, to);
 307                         if (unlikely(written < chars)) {
 308                                 if (!ret)
 309                                         ret = -EFAULT;
 310                                 break;
 311                         }
 312                         ret += chars;
 313                         buf->offset += chars;
 314                         buf->len -= chars;
 315 
 316                         /* Was it a packet buffer? Clean up and exit */
 317                         if (buf->flags & PIPE_BUF_FLAG_PACKET) {
 318                                 total_len = chars;
 319                                 buf->len = 0;
 320                         }
 321 
 322                         if (!buf->len) {
 323                                 pipe_buf_release(pipe, buf);
 324                                 curbuf = (curbuf + 1) & (pipe->buffers - 1);
 325                                 pipe->curbuf = curbuf;
 326                                 pipe->nrbufs = --bufs;
 327                                 do_wakeup = 1;
 328                         }
 329                         total_len -= chars;
 330                         if (!total_len)
 331                                 break;  /* common path: read succeeded */
 332                 }
 333                 if (bufs)       /* More to do? */
 334                         continue;
 335                 if (!pipe->writers)
 336                         break;
 337                 if (!pipe->waiting_writers) {
 338                         /* syscall merging: Usually we must not sleep
 339                          * if O_NONBLOCK is set, or if we got some data.
 340                          * But if a writer sleeps in kernel space, then
 341                          * we can wait for that data without violating POSIX.
 342                          */
 343                         if (ret)
 344                                 break;
 345                         if (filp->f_flags & O_NONBLOCK) {
 346                                 ret = -EAGAIN;
 347                                 break;
 348                         }
 349                 }
 350                 if (signal_pending(current)) {
 351                         if (!ret)
 352                                 ret = -ERESTARTSYS;
 353                         break;
 354                 }
 355                 if (do_wakeup) {
 356                         wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
 357                         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 358                 }
 359                 pipe_wait(pipe);
 360         }
 361         __pipe_unlock(pipe);
 362 
 363         /* Signal writers asynchronously that there is more room. */
 364         if (do_wakeup) {
 365                 wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
 366                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 367         }
 368         if (ret > 0)
 369                 file_accessed(filp);
 370         return ret;
 371 }
 372 
 373 static inline int is_packetized(struct file *file)
 374 {
 375         return (file->f_flags & O_DIRECT) != 0;
 376 }
 377 
 378 static ssize_t
 379 pipe_write(struct kiocb *iocb, struct iov_iter *from)
 380 {
 381         struct file *filp = iocb->ki_filp;
 382         struct pipe_inode_info *pipe = filp->private_data;
 383         ssize_t ret = 0;
 384         int do_wakeup = 0;
 385         size_t total_len = iov_iter_count(from);
 386         ssize_t chars;
 387 
 388         /* Null write succeeds. */
 389         if (unlikely(total_len == 0))
 390                 return 0;
 391 
 392         __pipe_lock(pipe);
 393 
 394         if (!pipe->readers) {
 395                 send_sig(SIGPIPE, current, 0);
 396                 ret = -EPIPE;
 397                 goto out;
 398         }
 399 
 400         /* We try to merge small writes */
 401         chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
 402         if (pipe->nrbufs && chars != 0) {
 403                 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
 404                                                         (pipe->buffers - 1);
 405                 struct pipe_buffer *buf = pipe->bufs + lastbuf;
 406                 int offset = buf->offset + buf->len;
 407 
 408                 if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
 409                         ret = pipe_buf_confirm(pipe, buf);
 410                         if (ret)
 411                                 goto out;
 412 
 413                         ret = copy_page_from_iter(buf->page, offset, chars, from);
 414                         if (unlikely(ret < chars)) {
 415                                 ret = -EFAULT;
 416                                 goto out;
 417                         }
 418                         do_wakeup = 1;
 419                         buf->len += ret;
 420                         if (!iov_iter_count(from))
 421                                 goto out;
 422                 }
 423         }
 424 
 425         for (;;) {
 426                 int bufs;
 427 
 428                 if (!pipe->readers) {
 429                         send_sig(SIGPIPE, current, 0);
 430                         if (!ret)
 431                                 ret = -EPIPE;
 432                         break;
 433                 }
 434                 bufs = pipe->nrbufs;
 435                 if (bufs < pipe->buffers) {
 436                         int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
 437                         struct pipe_buffer *buf = pipe->bufs + newbuf;
 438                         struct page *page = pipe->tmp_page;
 439                         int copied;
 440 
 441                         if (!page) {
 442                                 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
 443                                 if (unlikely(!page)) {
 444                                         ret = ret ? : -ENOMEM;
 445                                         break;
 446                                 }
 447                                 pipe->tmp_page = page;
 448                         }
 449                         /* Always wake up, even if the copy fails. Otherwise
 450                          * we lock up (O_NONBLOCK-)readers that sleep due to
 451                          * syscall merging.
 452                          * FIXME! Is this really true?
 453                          */
 454                         do_wakeup = 1;
 455                         copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
 456                         if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
 457                                 if (!ret)
 458                                         ret = -EFAULT;
 459                                 break;
 460                         }
 461                         ret += copied;
 462 
 463                         /* Insert it into the buffer array */
 464                         buf->page = page;
 465                         buf->ops = &anon_pipe_buf_ops;
 466                         buf->offset = 0;
 467                         buf->len = copied;
 468                         buf->flags = 0;
 469                         if (is_packetized(filp)) {
 470                                 buf->ops = &packet_pipe_buf_ops;
 471                                 buf->flags = PIPE_BUF_FLAG_PACKET;
 472                         }
 473                         pipe->nrbufs = ++bufs;
 474                         pipe->tmp_page = NULL;
 475 
 476                         if (!iov_iter_count(from))
 477                                 break;
 478                 }
 479                 if (bufs < pipe->buffers)
 480                         continue;
 481                 if (filp->f_flags & O_NONBLOCK) {
 482                         if (!ret)
 483                                 ret = -EAGAIN;
 484                         break;
 485                 }
 486                 if (signal_pending(current)) {
 487                         if (!ret)
 488                                 ret = -ERESTARTSYS;
 489                         break;
 490                 }
 491                 if (do_wakeup) {
 492                         wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
 493                         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 494                         do_wakeup = 0;
 495                 }
 496                 pipe->waiting_writers++;
 497                 pipe_wait(pipe);
 498                 pipe->waiting_writers--;
 499         }
 500 out:
 501         __pipe_unlock(pipe);
 502         if (do_wakeup) {
 503                 wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
 504                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 505         }
 506         if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
 507                 int err = file_update_time(filp);
 508                 if (err)
 509                         ret = err;
 510                 sb_end_write(file_inode(filp)->i_sb);
 511         }
 512         return ret;
 513 }
 514 
 515 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 516 {
 517         struct pipe_inode_info *pipe = filp->private_data;
 518         int count, buf, nrbufs;
 519 
 520         switch (cmd) {
 521                 case FIONREAD:
 522                         __pipe_lock(pipe);
 523                         count = 0;
 524                         buf = pipe->curbuf;
 525                         nrbufs = pipe->nrbufs;
 526                         while (--nrbufs >= 0) {
 527                                 count += pipe->bufs[buf].len;
 528                                 buf = (buf+1) & (pipe->buffers - 1);
 529                         }
 530                         __pipe_unlock(pipe);
 531 
 532                         return put_user(count, (int __user *)arg);
 533                 default:
 534                         return -ENOIOCTLCMD;
 535         }
 536 }
 537 
 538 /* No kernel lock held - fine */
 539 static __poll_t
 540 pipe_poll(struct file *filp, poll_table *wait)
 541 {
 542         __poll_t mask;
 543         struct pipe_inode_info *pipe = filp->private_data;
 544         int nrbufs;
 545 
 546         poll_wait(filp, &pipe->wait, wait);
 547 
 548         /* Reading only -- no need for acquiring the semaphore.  */
 549         nrbufs = pipe->nrbufs;
 550         mask = 0;
 551         if (filp->f_mode & FMODE_READ) {
 552                 mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
 553                 if (!pipe->writers && filp->f_version != pipe->w_counter)
 554                         mask |= EPOLLHUP;
 555         }
 556 
 557         if (filp->f_mode & FMODE_WRITE) {
 558                 mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0;
 559                 /*
 560                  * Most Unices do not set EPOLLERR for FIFOs but on Linux they
 561                  * behave exactly like pipes for poll().
 562                  */
 563                 if (!pipe->readers)
 564                         mask |= EPOLLERR;
 565         }
 566 
 567         return mask;
 568 }
 569 
 570 static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
 571 {
 572         int kill = 0;
 573 
 574         spin_lock(&inode->i_lock);
 575         if (!--pipe->files) {
 576                 inode->i_pipe = NULL;
 577                 kill = 1;
 578         }
 579         spin_unlock(&inode->i_lock);
 580 
 581         if (kill)
 582                 free_pipe_info(pipe);
 583 }
 584 
 585 static int
 586 pipe_release(struct inode *inode, struct file *file)
 587 {
 588         struct pipe_inode_info *pipe = file->private_data;
 589 
 590         __pipe_lock(pipe);
 591         if (file->f_mode & FMODE_READ)
 592                 pipe->readers--;
 593         if (file->f_mode & FMODE_WRITE)
 594                 pipe->writers--;
 595 
 596         if (pipe->readers || pipe->writers) {
 597                 wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
 598                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 599                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 600         }
 601         __pipe_unlock(pipe);
 602 
 603         put_pipe_info(inode, pipe);
 604         return 0;
 605 }
 606 
 607 static int
 608 pipe_fasync(int fd, struct file *filp, int on)
 609 {
 610         struct pipe_inode_info *pipe = filp->private_data;
 611         int retval = 0;
 612 
 613         __pipe_lock(pipe);
 614         if (filp->f_mode & FMODE_READ)
 615                 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
 616         if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
 617                 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
 618                 if (retval < 0 && (filp->f_mode & FMODE_READ))
 619                         /* this can happen only if on == T */
 620                         fasync_helper(-1, filp, 0, &pipe->fasync_readers);
 621         }
 622         __pipe_unlock(pipe);
 623         return retval;
 624 }
 625 
 626 static unsigned long account_pipe_buffers(struct user_struct *user,
 627                                  unsigned long old, unsigned long new)
 628 {
 629         return atomic_long_add_return(new - old, &user->pipe_bufs);
 630 }
 631 
 632 static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
 633 {
 634         unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
 635 
 636         return soft_limit && user_bufs > soft_limit;
 637 }
 638 
 639 static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
 640 {
 641         unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
 642 
 643         return hard_limit && user_bufs > hard_limit;
 644 }
 645 
 646 static bool is_unprivileged_user(void)
 647 {
 648         return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
 649 }
 650 
 651 struct pipe_inode_info *alloc_pipe_info(void)
 652 {
 653         struct pipe_inode_info *pipe;
 654         unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
 655         struct user_struct *user = get_current_user();
 656         unsigned long user_bufs;
 657         unsigned int max_size = READ_ONCE(pipe_max_size);
 658 
 659         pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
 660         if (pipe == NULL)
 661                 goto out_free_uid;
 662 
 663         if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
 664                 pipe_bufs = max_size >> PAGE_SHIFT;
 665 
 666         user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
 667 
 668         if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
 669                 user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
 670                 pipe_bufs = 1;
 671         }
 672 
 673         if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
 674                 goto out_revert_acct;
 675 
 676         pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
 677                              GFP_KERNEL_ACCOUNT);
 678 
 679         if (pipe->bufs) {
 680                 init_waitqueue_head(&pipe->wait);
 681                 pipe->r_counter = pipe->w_counter = 1;
 682                 pipe->buffers = pipe_bufs;
 683                 pipe->user = user;
 684                 mutex_init(&pipe->mutex);
 685                 return pipe;
 686         }
 687 
 688 out_revert_acct:
 689         (void) account_pipe_buffers(user, pipe_bufs, 0);
 690         kfree(pipe);
 691 out_free_uid:
 692         free_uid(user);
 693         return NULL;
 694 }
 695 
 696 void free_pipe_info(struct pipe_inode_info *pipe)
 697 {
 698         int i;
 699 
 700         (void) account_pipe_buffers(pipe->user, pipe->buffers, 0);
 701         free_uid(pipe->user);
 702         for (i = 0; i < pipe->buffers; i++) {
 703                 struct pipe_buffer *buf = pipe->bufs + i;
 704                 if (buf->ops)
 705                         pipe_buf_release(pipe, buf);
 706         }
 707         if (pipe->tmp_page)
 708                 __free_page(pipe->tmp_page);
 709         kfree(pipe->bufs);
 710         kfree(pipe);
 711 }
 712 
 713 static struct vfsmount *pipe_mnt __read_mostly;
 714 
 715 /*
 716  * pipefs_dname() is called from d_path().
 717  */
 718 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
 719 {
 720         return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
 721                                 d_inode(dentry)->i_ino);
 722 }
 723 
 724 static const struct dentry_operations pipefs_dentry_operations = {
 725         .d_dname        = pipefs_dname,
 726 };
 727 
 728 static struct inode * get_pipe_inode(void)
 729 {
 730         struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
 731         struct pipe_inode_info *pipe;
 732 
 733         if (!inode)
 734                 goto fail_inode;
 735 
 736         inode->i_ino = get_next_ino();
 737 
 738         pipe = alloc_pipe_info();
 739         if (!pipe)
 740                 goto fail_iput;
 741 
 742         inode->i_pipe = pipe;
 743         pipe->files = 2;
 744         pipe->readers = pipe->writers = 1;
 745         inode->i_fop = &pipefifo_fops;
 746 
 747         /*
 748          * Mark the inode dirty from the very beginning,
 749          * that way it will never be moved to the dirty
 750          * list because "mark_inode_dirty()" will think
 751          * that it already _is_ on the dirty list.
 752          */
 753         inode->i_state = I_DIRTY;
 754         inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
 755         inode->i_uid = current_fsuid();
 756         inode->i_gid = current_fsgid();
 757         inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 758 
 759         return inode;
 760 
 761 fail_iput:
 762         iput(inode);
 763 
 764 fail_inode:
 765         return NULL;
 766 }
 767 
 768 int create_pipe_files(struct file **res, int flags)
 769 {
 770         struct inode *inode = get_pipe_inode();
 771         struct file *f;
 772 
 773         if (!inode)
 774                 return -ENFILE;
 775 
 776         f = alloc_file_pseudo(inode, pipe_mnt, "",
 777                                 O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
 778                                 &pipefifo_fops);
 779         if (IS_ERR(f)) {
 780                 free_pipe_info(inode->i_pipe);
 781                 iput(inode);
 782                 return PTR_ERR(f);
 783         }
 784 
 785         f->private_data = inode->i_pipe;
 786 
 787         res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
 788                                   &pipefifo_fops);
 789         if (IS_ERR(res[0])) {
 790                 put_pipe_info(inode, inode->i_pipe);
 791                 fput(f);
 792                 return PTR_ERR(res[0]);
 793         }
 794         res[0]->private_data = inode->i_pipe;
 795         res[1] = f;
 796         return 0;
 797 }
 798 
 799 static int __do_pipe_flags(int *fd, struct file **files, int flags)
 800 {
 801         int error;
 802         int fdw, fdr;
 803 
 804         if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
 805                 return -EINVAL;
 806 
 807         error = create_pipe_files(files, flags);
 808         if (error)
 809                 return error;
 810 
 811         error = get_unused_fd_flags(flags);
 812         if (error < 0)
 813                 goto err_read_pipe;
 814         fdr = error;
 815 
 816         error = get_unused_fd_flags(flags);
 817         if (error < 0)
 818                 goto err_fdr;
 819         fdw = error;
 820 
 821         audit_fd_pair(fdr, fdw);
 822         fd[0] = fdr;
 823         fd[1] = fdw;
 824         return 0;
 825 
 826  err_fdr:
 827         put_unused_fd(fdr);
 828  err_read_pipe:
 829         fput(files[0]);
 830         fput(files[1]);
 831         return error;
 832 }
 833 
 834 int do_pipe_flags(int *fd, int flags)
 835 {
 836         struct file *files[2];
 837         int error = __do_pipe_flags(fd, files, flags);
 838         if (!error) {
 839                 fd_install(fd[0], files[0]);
 840                 fd_install(fd[1], files[1]);
 841         }
 842         return error;
 843 }
 844 
 845 /*
 846  * sys_pipe() is the normal C calling standard for creating
 847  * a pipe. It's not the way Unix traditionally does this, though.
 848  */
 849 static int do_pipe2(int __user *fildes, int flags)
 850 {
 851         struct file *files[2];
 852         int fd[2];
 853         int error;
 854 
 855         error = __do_pipe_flags(fd, files, flags);
 856         if (!error) {
 857                 if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
 858                         fput(files[0]);
 859                         fput(files[1]);
 860                         put_unused_fd(fd[0]);
 861                         put_unused_fd(fd[1]);
 862                         error = -EFAULT;
 863                 } else {
 864                         fd_install(fd[0], files[0]);
 865                         fd_install(fd[1], files[1]);
 866                 }
 867         }
 868         return error;
 869 }
 870 
 871 SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
 872 {
 873         return do_pipe2(fildes, flags);
 874 }
 875 
 876 SYSCALL_DEFINE1(pipe, int __user *, fildes)
 877 {
 878         return do_pipe2(fildes, 0);
 879 }
 880 
 881 static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
 882 {
 883         int cur = *cnt; 
 884 
 885         while (cur == *cnt) {
 886                 pipe_wait(pipe);
 887                 if (signal_pending(current))
 888                         break;
 889         }
 890         return cur == *cnt ? -ERESTARTSYS : 0;
 891 }
 892 
 893 static void wake_up_partner(struct pipe_inode_info *pipe)
 894 {
 895         wake_up_interruptible(&pipe->wait);
 896 }
 897 
     /*
      * fifo_open - the ->open() method installed in pipefifo_fops.
      *
      * Attaches @filp to the pipe_inode_info hanging off @inode (allocating one
      * if the inode has none yet) and then updates the reader/writer counts
      * according to the access mode in filp->f_mode.
      *
      * Returns 0 on success, or:
      *   -ENOMEM       alloc_pipe_info() failed;
      *   -ENXIO        write-only O_NONBLOCK open, not on pipefs, with no readers;
      *   -EINVAL       neither FMODE_READ nor FMODE_WRITE is set in f_mode;
      *   -ERESTARTSYS  wait_for_partner() was interrupted by a signal before
      *                 the other end showed up.
      */
 898 static int fifo_open(struct inode *inode, struct file *filp)
 899 {
 900         struct pipe_inode_info *pipe;
             /* True when the inode lives on pipefs; such opens never block below. */
 901         bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
 902         int ret;
 903 
 904         filp->f_version = 0;
 905 
             /*
              * Look up the inode's pipe under i_lock, taking a ->files count on
              * it.  If there is none, allocate one with the lock dropped and
              * then re-check under the lock: when another opener installed a
              * pipe in the meantime, use theirs and free our unused copy.
              */
 906         spin_lock(&inode->i_lock);
 907         if (inode->i_pipe) {
 908                 pipe = inode->i_pipe;
 909                 pipe->files++;
 910                 spin_unlock(&inode->i_lock);
 911         } else {
 912                 spin_unlock(&inode->i_lock);
 913                 pipe = alloc_pipe_info();
 914                 if (!pipe)
 915                         return -ENOMEM;
 916                 pipe->files = 1;
 917                 spin_lock(&inode->i_lock);
 918                 if (unlikely(inode->i_pipe)) {
 919                         inode->i_pipe->files++;
 920                         spin_unlock(&inode->i_lock);
 921                         free_pipe_info(pipe);
 922                         pipe = inode->i_pipe;
 923                 } else {
 924                         inode->i_pipe = pipe;
 925                         spin_unlock(&inode->i_lock);
 926                 }
 927         }
 928         filp->private_data = pipe;
 929         /* OK, we have a pipe and it's pinned down */
 930 
             /* Everything below, up to the unlock on each exit path, runs under the pipe lock. */
 931         __pipe_lock(pipe);
 932 
 933         /* We can only do regular read/write on fifos */
 934         filp->f_mode &= (FMODE_READ | FMODE_WRITE);
 935 
 936         switch (filp->f_mode) {
 937         case FMODE_READ:
 938         /*
 939          *  O_RDONLY
 940          *  POSIX.1 says that O_NONBLOCK means return with the FIFO
 941          *  opened, even when there is no process writing the FIFO.
 942          */
                     /*
                      * Bumping r_counter is what a writer sleeping in
                      * wait_for_partner(pipe, &pipe->r_counter) watches for; the
                      * first reader also wakes the pipe's wait queue.
                      */
 943                 pipe->r_counter++;
 944                 if (pipe->readers++ == 0)
 945                         wake_up_partner(pipe);
 946 
 947                 if (!is_pipe && !pipe->writers) {
 948                         if ((filp->f_flags & O_NONBLOCK)) {
 949                                 /* suppress EPOLLHUP until we have
 950                                  * seen a writer */
 951                                 filp->f_version = pipe->w_counter;
 952                         } else {
                                     /* Blocking open: wait until w_counter changes. */
 953                                 if (wait_for_partner(pipe, &pipe->w_counter))
 954                                         goto err_rd;
 955                         }
 956                 }
 957                 break;
 958         
 959         case FMODE_WRITE:
 960         /*
 961          *  O_WRONLY
 962          *  POSIX.1 says that O_NONBLOCK means return -1 with
 963          *  errno=ENXIO when there is no process reading the FIFO.
 964          */
                     /* ret is preset for the goto just below; the counts are untouched at that point. */
 965                 ret = -ENXIO;
 966                 if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
 967                         goto err;
 968 
 969                 pipe->w_counter++;
 970                 if (!pipe->writers++)
 971                         wake_up_partner(pipe);
 972 
 973                 if (!is_pipe && !pipe->readers) {
                             /* Wait until r_counter changes. */
 974                         if (wait_for_partner(pipe, &pipe->r_counter))
 975                                 goto err_wr;
 976                 }
 977                 break;
 978         
 979         case FMODE_READ | FMODE_WRITE:
 980         /*
 981          *  O_RDWR
 982          *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
 983          *  This implementation will NEVER block on a O_RDWR open, since
 984          *  the process can at least talk to itself.
 985          */
 986 
 987                 pipe->readers++;
 988                 pipe->writers++;
 989                 pipe->r_counter++;
 990                 pipe->w_counter++;
                     /* Wake waiters if this open supplied the first reader or the first writer. */
 991                 if (pipe->readers == 1 || pipe->writers == 1)
 992                         wake_up_partner(pipe);
 993                 break;
 994 
 995         default:
                     /* f_mode had neither FMODE_READ nor FMODE_WRITE set. */
 996                 ret = -EINVAL;
 997                 goto err;
 998         }
 999 
1000         /* Ok! */
1001         __pipe_unlock(pipe);
1002         return 0;
1003 
             /*
              * Interrupted read-only open: take back the reader count added above
              * and wake the wait queue if that was the last reader.
              */
1004 err_rd:
1005         if (!--pipe->readers)
1006                 wake_up_interruptible(&pipe->wait);
1007         ret = -ERESTARTSYS;
1008         goto err;
1009 
             /* Interrupted write-only open: same unwinding for the writer count. */
1010 err_wr:
1011         if (!--pipe->writers)
1012                 wake_up_interruptible(&pipe->wait);
1013         ret = -ERESTARTSYS;
1014         goto err;
1015 
             /*
              * Common failure exit: drop the pipe lock, then hand the pipe back.
              * NOTE(review): put_pipe_info() is defined outside this view;
              * presumably it releases the ->files count taken at the top - confirm.
              */
1016 err:
1017         __pipe_unlock(pipe);
1018 
1019         put_pipe_info(inode, pipe);
1020         return ret;
1021 }
1022 
1023 const struct file_operations pipefifo_fops = {
1024         .open           = fifo_open,
1025         .llseek         = no_llseek,
1026         .read_iter      = pipe_read,
1027         .write_iter     = pipe_write,
1028         .poll           = pipe_poll,
1029         .unlocked_ioctl = pipe_ioctl,
1030         .release        = pipe_release,
1031         .fasync         = pipe_fasync,
1032 };
1033 
1034 /*
1035  * Currently we rely on the pipe array holding a power-of-2 number
1036  * of pages. Returns 0 on error.
1037  */
1038 unsigned int round_pipe_size(unsigned long size)
1039 {
1040         if (size > (1U << 31))
1041                 return 0;
1042 
1043         /* Minimum pipe size, as required by POSIX */
1044         if (size < PAGE_SIZE)
1045                 return PAGE_SIZE;
1046 
1047         return roundup_pow_of_two(size);
1048 }
1049 
1050 /*
1051  * Allocate a new array of pipe buffers and copy the info over. Returns the
1052  * pipe size if successful, or return -ERROR on error.
1053  */
1054 static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1055 {
1056         struct pipe_buffer *bufs;
1057         unsigned int size, nr_pages;
1058         unsigned long user_bufs;
1059         long ret = 0;
1060 
1061         size = round_pipe_size(arg);
1062         nr_pages = size >> PAGE_SHIFT;
1063 
1064         if (!nr_pages)
1065                 return -EINVAL;
1066 
1067         /*
1068          * If trying to increase the pipe capacity, check that an
1069          * unprivileged user is not trying to exceed various limits
1070          * (soft limit check here, hard limit check just below).
1071          * Decreasing the pipe capacity is always permitted, even
1072          * if the user is currently over a limit.
1073          */
1074         if (nr_pages > pipe->buffers &&
1075                         size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
1076                 return -EPERM;
1077 
1078         user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
1079 
1080         if (nr_pages > pipe->buffers &&
1081                         (too_many_pipe_buffers_hard(user_bufs) ||
1082                          too_many_pipe_buffers_soft(user_bufs)) &&
1083                         is_unprivileged_user()) {
1084                 ret = -EPERM;
1085                 goto out_revert_acct;
1086         }
1087 
1088         /*
1089          * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
1090          * expect a lot of shrink+grow operations, just free and allocate
1091          * again like we would do for growing. If the pipe currently
1092          * contains more buffers than arg, then return busy.
1093          */
1094         if (nr_pages < pipe->nrbufs) {
1095                 ret = -EBUSY;
1096                 goto out_revert_acct;
1097         }
1098 
1099         bufs = kcalloc(nr_pages, sizeof(*bufs),
1100                        GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
1101         if (unlikely(!bufs)) {
1102                 ret = -ENOMEM;
1103                 goto out_revert_acct;
1104         }
1105 
1106         /*
1107          * The pipe array wraps around, so just start the new one at zero
1108          * and adjust the indexes.
1109          */
1110         if (pipe->nrbufs) {
1111                 unsigned int tail;
1112                 unsigned int head;
1113 
1114                 tail = pipe->curbuf + pipe->nrbufs;
1115                 if (tail < pipe->buffers)
1116                         tail = 0;
1117                 else
1118                         tail &= (pipe->buffers - 1);
1119 
1120                 head = pipe->nrbufs - tail;
1121                 if (head)
1122                         memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
1123                 if (tail)
1124                         memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
1125         }
1126 
1127         pipe->curbuf = 0;
1128         kfree(pipe->bufs);
1129         pipe->bufs = bufs;
1130         pipe->buffers = nr_pages;
1131         return nr_pages * PAGE_SIZE;
1132 
1133 out_revert_acct:
1134         (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
1135         return ret;
1136 }
1137 
1138 /*
1139  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1140  * location, so checking ->i_pipe is not enough to verify that this is a
1141  * pipe.
1142  */
1143 struct pipe_inode_info *get_pipe_info(struct file *file)
1144 {
1145         return file->f_op == &pipefifo_fops ? file->private_data : NULL;
1146 }
1147 
1148 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1149 {
1150         struct pipe_inode_info *pipe;
1151         long ret;
1152 
1153         pipe = get_pipe_info(file);
1154         if (!pipe)
1155                 return -EBADF;
1156 
1157         __pipe_lock(pipe);
1158 
1159         switch (cmd) {
1160         case F_SETPIPE_SZ:
1161                 ret = pipe_set_size(pipe, arg);
1162                 break;
1163         case F_GETPIPE_SZ:
1164                 ret = pipe->buffers * PAGE_SIZE;
1165                 break;
1166         default:
1167                 ret = -EINVAL;
1168                 break;
1169         }
1170 
1171         __pipe_unlock(pipe);
1172         return ret;
1173 }
1174 
1175 static const struct super_operations pipefs_ops = {
1176         .destroy_inode = free_inode_nonrcu,
1177         .statfs = simple_statfs,
1178 };
1179 
1180 /*
1181  * pipefs should _never_ be mounted by userland - too much of security hassle,
1182  * no real gain from having the whole whorehouse mounted. So we don't need
1183  * any operations on the root directory. However, we need a non-trivial
1184  * d_name - pipe: will go nicely and kill the special-casing in procfs.
1185  */
1186 
1187 static int pipefs_init_fs_context(struct fs_context *fc)
1188 {
1189         struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
1190         if (!ctx)
1191                 return -ENOMEM;
1192         ctx->ops = &pipefs_ops;
1193         ctx->dops = &pipefs_dentry_operations;
1194         return 0;
1195 }
1196 
1197 static struct file_system_type pipe_fs_type = {
1198         .name           = "pipefs",
1199         .init_fs_context = pipefs_init_fs_context,
1200         .kill_sb        = kill_anon_super,
1201 };
1202 
1203 static int __init init_pipe_fs(void)
1204 {
1205         int err = register_filesystem(&pipe_fs_type);
1206 
1207         if (!err) {
1208                 pipe_mnt = kern_mount(&pipe_fs_type);
1209                 if (IS_ERR(pipe_mnt)) {
1210                         err = PTR_ERR(pipe_mnt);
1211                         unregister_filesystem(&pipe_fs_type);
1212                 }
1213         }
1214         return err;
1215 }
1216 
1217 fs_initcall(init_pipe_fs);

/* [<][>][^][v][top][bottom][index][help] */