root/kernel/pid.c


DEFINITIONS

This source file includes the following definitions.
  1. put_pid
  2. delayed_put_pid
  3. free_pid
  4. alloc_pid
  5. disable_pid_allocation
  6. find_pid_ns
  7. find_vpid
  8. task_pid_ptr
  9. attach_pid
  10. __change_pid
  11. detach_pid
  12. change_pid
  13. transfer_pid
  14. pid_task
  15. find_task_by_pid_ns
  16. find_task_by_vpid
  17. find_get_task_by_vpid
  18. get_task_pid
  19. get_pid_task
  20. find_get_pid
  21. pid_nr_ns
  22. pid_vnr
  23. __task_pid_nr_ns
  24. task_active_pid_ns
  25. find_ge_pid
  26. pidfd_create
  27. SYSCALL_DEFINE2
  28. pid_idr_init

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Generic pidhash and scalable, time-bounded PID allocator
   4  *
   5  * (C) 2002-2003 Nadia Yvette Chambers, IBM
   6  * (C) 2004 Nadia Yvette Chambers, Oracle
   7  * (C) 2002-2004 Ingo Molnar, Red Hat
   8  *
   9  * pid-structures are backing objects for tasks sharing a given ID to chain
  10  * against. There is very little to them aside from interning them in the
  11  * per-namespace IDR and parking tasks using a given ID on a list.
  12  *
  13  * The per-PID task lists are only changed with the tasklist_lock
  14  * write-acquired, and they are only walked with the tasklist_lock at
  15  * least read-acquired or under rcu_read_lock(), so there's no additional
  16  * SMP locking needed here.
  17  *
  18  * PIDs themselves come from a per-namespace IDR: allocation reserves one
  19  * id per namespace level with idr_alloc_cyclic() under pidmap_lock, and
  20  * freeing removes the id from each level's IDR, so both operations are
  21  * O(namespace depth). The typical case is a single-level namespace.
  22  *
  23  * Pid namespaces:
  24  *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
  25  *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
  26  *     Many thanks to Oleg Nesterov for comments and help
  27  *
  28  */
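/*
 * Layout sketch (illustrative values, not from the original source): a task
 * created two namespace levels below the init namespace carries one
 * struct upid per level, e.g.
 *
 *	pid->level      = 2;
 *	pid->numbers[0] = { .nr = 12345, .ns = &init_pid_ns };  id in the init ns
 *	pid->numbers[1] = { .nr = 42,    .ns = child_ns };      id one level down
 *	pid->numbers[2] = { .nr = 7,     .ns = task_ns };       id in the task's own ns
 *
 * pid_nr_ns() below simply indexes numbers[] at ns->level and checks that
 * the stored namespace matches; child_ns and task_ns are hypothetical names.
 */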
  29 
  30 #include <linux/mm.h>
  31 #include <linux/export.h>
  32 #include <linux/slab.h>
  33 #include <linux/init.h>
  34 #include <linux/rculist.h>
  35 #include <linux/memblock.h>
  36 #include <linux/pid_namespace.h>
  37 #include <linux/init_task.h>
  38 #include <linux/syscalls.h>
  39 #include <linux/proc_ns.h>
  40 #include <linux/refcount.h>
  41 #include <linux/anon_inodes.h>
  42 #include <linux/sched/signal.h>
  43 #include <linux/sched/task.h>
  44 #include <linux/idr.h>
  45 
  46 struct pid init_struct_pid = {
  47         .count          = REFCOUNT_INIT(1),
  48         .tasks          = {
  49                 { .first = NULL },
  50                 { .first = NULL },
  51                 { .first = NULL },
  52         },
  53         .level          = 0,
  54         .numbers        = { {
  55                 .nr             = 0,
  56                 .ns             = &init_pid_ns,
  57         }, }
  58 };
  59 
  60 int pid_max = PID_MAX_DEFAULT;
  61 
  62 #define RESERVED_PIDS           300
  63 
  64 int pid_max_min = RESERVED_PIDS + 1;
  65 int pid_max_max = PID_MAX_LIMIT;
  66 
  67 /*
  68  * The IDR backing each pid namespace starts out empty; its nodes are
  69  * allocated on first use as PIDs are handed out. This way a low pid_max
  70  * value does not cause memory to be allocated up front, but the scheme
  71  * still scales up to PID_MAX_LIMIT (4 million PIDs on 64-bit) at runtime.
  72  */
  73 struct pid_namespace init_pid_ns = {
  74         .kref = KREF_INIT(2),
  75         .idr = IDR_INIT(init_pid_ns.idr),
  76         .pid_allocated = PIDNS_ADDING,
  77         .level = 0,
  78         .child_reaper = &init_task,
  79         .user_ns = &init_user_ns,
  80         .ns.inum = PROC_PID_INIT_INO,
  81 #ifdef CONFIG_PID_NS
  82         .ns.ops = &pidns_operations,
  83 #endif
  84 };
  85 EXPORT_SYMBOL_GPL(init_pid_ns);
  86 
  87 /*
  88  * Note: interrupts must be disabled while the pidmap_lock is held, as an
  89  * interrupt might come in and do read_lock(&tasklist_lock).
  90  *
  91  * If we don't disable interrupts there is a nasty deadlock between
  92  * detach_pid()->free_pid(), which takes pidmap_lock while tasklist_lock
  93  * is write-held, and another CPU that takes spin_lock(&pidmap_lock) and
  94  * is then interrupted by a handler doing read_lock(&tasklist_lock).
  95  *
  96  * Once tasklist_lock is cleaned up and no irq handlers take it any more,
  97  * interrupts can be left enabled here. For now it is easier to be safe
  98  * than to prove it can't happen.
  99  */
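/*
 * Illustrative interleaving of that deadlock (a sketch, not from the
 * original source):
 *
 *	CPU 0                                   CPU 1
 *	write_lock_irq(&tasklist_lock);         spin_lock(&pidmap_lock);
 *	detach_pid() -> free_pid()              <hard interrupt arrives>
 *	  spin_lock(&pidmap_lock);  // spins      read_lock(&tasklist_lock);  // spins
 *
 * CPU 0 waits for pidmap_lock held by CPU 1, and CPU 1's interrupt handler
 * waits for tasklist_lock held for write by CPU 0, so neither can make
 * progress. Hence the _irq/_irqsave locking used throughout this file.
 */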
 100 
 101 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 102 
 103 void put_pid(struct pid *pid)
 104 {
 105         struct pid_namespace *ns;
 106 
 107         if (!pid)
 108                 return;
 109 
 110         ns = pid->numbers[pid->level].ns;
 111         if (refcount_dec_and_test(&pid->count)) {
 112                 kmem_cache_free(ns->pid_cachep, pid);
 113                 put_pid_ns(ns);
 114         }
 115 }
 116 EXPORT_SYMBOL_GPL(put_pid);
 117 
 118 static void delayed_put_pid(struct rcu_head *rhp)
 119 {
 120         struct pid *pid = container_of(rhp, struct pid, rcu);
 121         put_pid(pid);
 122 }
 123 
 124 void free_pid(struct pid *pid)
 125 {
 126         /* We can be called with write_lock_irq(&tasklist_lock) held */
 127         int i;
 128         unsigned long flags;
 129 
 130         spin_lock_irqsave(&pidmap_lock, flags);
 131         for (i = 0; i <= pid->level; i++) {
 132                 struct upid *upid = pid->numbers + i;
 133                 struct pid_namespace *ns = upid->ns;
 134                 switch (--ns->pid_allocated) {
 135                 case 2:
 136                 case 1:
  137                         /* When all that is left in the pid namespace
  138                          * is the reaper, wake up the reaper.  The reaper
  139                          * may be sleeping in zap_pid_ns_processes().
  140                          */
 141                         wake_up_process(ns->child_reaper);
 142                         break;
 143                 case PIDNS_ADDING:
 144                         /* Handle a fork failure of the first process */
 145                         WARN_ON(ns->child_reaper);
 146                         ns->pid_allocated = 0;
 147                         /* fall through */
 148                 case 0:
 149                         schedule_work(&ns->proc_work);
 150                         break;
 151                 }
 152 
 153                 idr_remove(&ns->idr, upid->nr);
 154         }
 155         spin_unlock_irqrestore(&pidmap_lock, flags);
 156 
 157         call_rcu(&pid->rcu, delayed_put_pid);
 158 }
 159 
 160 struct pid *alloc_pid(struct pid_namespace *ns)
 161 {
 162         struct pid *pid;
 163         enum pid_type type;
 164         int i, nr;
 165         struct pid_namespace *tmp;
 166         struct upid *upid;
 167         int retval = -ENOMEM;
 168 
 169         pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
 170         if (!pid)
 171                 return ERR_PTR(retval);
 172 
 173         tmp = ns;
 174         pid->level = ns->level;
 175 
 176         for (i = ns->level; i >= 0; i--) {
 177                 int pid_min = 1;
 178 
 179                 idr_preload(GFP_KERNEL);
 180                 spin_lock_irq(&pidmap_lock);
 181 
 182                 /*
 183                  * init really needs pid 1, but after reaching the maximum
 184                  * wrap back to RESERVED_PIDS
 185                  */
 186                 if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
 187                         pid_min = RESERVED_PIDS;
 188 
 189                 /*
 190                  * Store a null pointer so find_pid_ns does not find
 191                  * a partially initialized PID (see below).
 192                  */
 193                 nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
 194                                       pid_max, GFP_ATOMIC);
 195                 spin_unlock_irq(&pidmap_lock);
 196                 idr_preload_end();
 197 
 198                 if (nr < 0) {
 199                         retval = (nr == -ENOSPC) ? -EAGAIN : nr;
 200                         goto out_free;
 201                 }
 202 
 203                 pid->numbers[i].nr = nr;
 204                 pid->numbers[i].ns = tmp;
 205                 tmp = tmp->parent;
 206         }
 207 
 208         if (unlikely(is_child_reaper(pid))) {
 209                 if (pid_ns_prepare_proc(ns))
 210                         goto out_free;
 211         }
 212 
 213         get_pid_ns(ns);
 214         refcount_set(&pid->count, 1);
 215         for (type = 0; type < PIDTYPE_MAX; ++type)
 216                 INIT_HLIST_HEAD(&pid->tasks[type]);
 217 
 218         init_waitqueue_head(&pid->wait_pidfd);
 219 
 220         upid = pid->numbers + ns->level;
 221         spin_lock_irq(&pidmap_lock);
 222         if (!(ns->pid_allocated & PIDNS_ADDING))
 223                 goto out_unlock;
 224         for ( ; upid >= pid->numbers; --upid) {
 225                 /* Make the PID visible to find_pid_ns. */
 226                 idr_replace(&upid->ns->idr, pid, upid->nr);
 227                 upid->ns->pid_allocated++;
 228         }
 229         spin_unlock_irq(&pidmap_lock);
 230 
 231         return pid;
 232 
 233 out_unlock:
 234         spin_unlock_irq(&pidmap_lock);
 235         put_pid_ns(ns);
 236 
 237 out_free:
 238         spin_lock_irq(&pidmap_lock);
 239         while (++i <= ns->level) {
 240                 upid = pid->numbers + i;
 241                 idr_remove(&upid->ns->idr, upid->nr);
 242         }
 243 
 244         /* On failure to allocate the first pid, reset the state */
 245         if (ns->pid_allocated == PIDNS_ADDING)
 246                 idr_set_cursor(&ns->idr, 0);
 247 
 248         spin_unlock_irq(&pidmap_lock);
 249 
 250         kmem_cache_free(ns->pid_cachep, pid);
 251         return ERR_PTR(retval);
 252 }
 253 
 254 void disable_pid_allocation(struct pid_namespace *ns)
 255 {
 256         spin_lock_irq(&pidmap_lock);
 257         ns->pid_allocated &= ~PIDNS_ADDING;
 258         spin_unlock_irq(&pidmap_lock);
 259 }
 260 
 261 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
 262 {
 263         return idr_find(&ns->idr, nr);
 264 }
 265 EXPORT_SYMBOL_GPL(find_pid_ns);
 266 
 267 struct pid *find_vpid(int nr)
 268 {
 269         return find_pid_ns(nr, task_active_pid_ns(current));
 270 }
 271 EXPORT_SYMBOL_GPL(find_vpid);
 272 
 273 static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
 274 {
 275         return (type == PIDTYPE_PID) ?
 276                 &task->thread_pid :
 277                 &task->signal->pids[type];
 278 }
 279 
 280 /*
 281  * attach_pid() must be called with the tasklist_lock write-held.
 282  */
 283 void attach_pid(struct task_struct *task, enum pid_type type)
 284 {
 285         struct pid *pid = *task_pid_ptr(task, type);
 286         hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
 287 }
 288 
 289 static void __change_pid(struct task_struct *task, enum pid_type type,
 290                         struct pid *new)
 291 {
 292         struct pid **pid_ptr = task_pid_ptr(task, type);
 293         struct pid *pid;
 294         int tmp;
 295 
 296         pid = *pid_ptr;
 297 
 298         hlist_del_rcu(&task->pid_links[type]);
 299         *pid_ptr = new;
 300 
 301         for (tmp = PIDTYPE_MAX; --tmp >= 0; )
 302                 if (!hlist_empty(&pid->tasks[tmp]))
 303                         return;
 304 
 305         free_pid(pid);
 306 }
 307 
 308 void detach_pid(struct task_struct *task, enum pid_type type)
 309 {
 310         __change_pid(task, type, NULL);
 311 }
 312 
 313 void change_pid(struct task_struct *task, enum pid_type type,
 314                 struct pid *pid)
 315 {
 316         __change_pid(task, type, pid);
 317         attach_pid(task, type);
 318 }
 319 
 320 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
 321 void transfer_pid(struct task_struct *old, struct task_struct *new,
 322                            enum pid_type type)
 323 {
 324         if (type == PIDTYPE_PID)
 325                 new->thread_pid = old->thread_pid;
 326         hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
 327 }
 328 
 329 struct task_struct *pid_task(struct pid *pid, enum pid_type type)
 330 {
 331         struct task_struct *result = NULL;
 332         if (pid) {
 333                 struct hlist_node *first;
 334                 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
 335                                               lockdep_tasklist_lock_is_held());
 336                 if (first)
 337                         result = hlist_entry(first, struct task_struct, pid_links[(type)]);
 338         }
 339         return result;
 340 }
 341 EXPORT_SYMBOL(pid_task);
 342 
 343 /*
 344  * Must be called under rcu_read_lock().
 345  */
 346 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 347 {
 348         RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
 349                          "find_task_by_pid_ns() needs rcu_read_lock() protection");
 350         return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 351 }
 352 
 353 struct task_struct *find_task_by_vpid(pid_t vnr)
 354 {
 355         return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
 356 }
 357 
 358 struct task_struct *find_get_task_by_vpid(pid_t nr)
 359 {
 360         struct task_struct *task;
 361 
 362         rcu_read_lock();
 363         task = find_task_by_vpid(nr);
 364         if (task)
 365                 get_task_struct(task);
 366         rcu_read_unlock();
 367 
 368         return task;
 369 }
 370 
 371 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 372 {
 373         struct pid *pid;
 374         rcu_read_lock();
 375         pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
 376         rcu_read_unlock();
 377         return pid;
 378 }
 379 EXPORT_SYMBOL_GPL(get_task_pid);
 380 
 381 struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
 382 {
 383         struct task_struct *result;
 384         rcu_read_lock();
 385         result = pid_task(pid, type);
 386         if (result)
 387                 get_task_struct(result);
 388         rcu_read_unlock();
 389         return result;
 390 }
 391 EXPORT_SYMBOL_GPL(get_pid_task);
 392 
 393 struct pid *find_get_pid(pid_t nr)
 394 {
 395         struct pid *pid;
 396 
 397         rcu_read_lock();
 398         pid = get_pid(find_vpid(nr));
 399         rcu_read_unlock();
 400 
 401         return pid;
 402 }
 403 EXPORT_SYMBOL_GPL(find_get_pid);
 404 
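/*
 * pid_nr_ns() reports the numeric id that @pid carries in @ns, or 0 when
 * @pid is not visible from @ns, i.e. when @ns is neither the namespace the
 * pid was allocated in nor one of its ancestors.
 */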
 405 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 406 {
 407         struct upid *upid;
 408         pid_t nr = 0;
 409 
 410         if (pid && ns->level <= pid->level) {
 411                 upid = &pid->numbers[ns->level];
 412                 if (upid->ns == ns)
 413                         nr = upid->nr;
 414         }
 415         return nr;
 416 }
 417 EXPORT_SYMBOL_GPL(pid_nr_ns);
 418 
 419 pid_t pid_vnr(struct pid *pid)
 420 {
 421         return pid_nr_ns(pid, task_active_pid_ns(current));
 422 }
 423 EXPORT_SYMBOL_GPL(pid_vnr);
 424 
 425 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 426                         struct pid_namespace *ns)
 427 {
 428         pid_t nr = 0;
 429 
 430         rcu_read_lock();
 431         if (!ns)
 432                 ns = task_active_pid_ns(current);
 433         if (likely(pid_alive(task)))
 434                 nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
 435         rcu_read_unlock();
 436 
 437         return nr;
 438 }
 439 EXPORT_SYMBOL(__task_pid_nr_ns);
 440 
 441 struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
 442 {
 443         return ns_of_pid(task_pid(tsk));
 444 }
 445 EXPORT_SYMBOL_GPL(task_active_pid_ns);
 446 
 447 /*
 448  * Used by proc to find the first pid that is greater than or equal to nr.
 449  *
  450  * If there is a pid at nr, this function is exactly the same as find_pid_ns.
 451  */
 452 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 453 {
 454         return idr_get_next(&ns->idr, &nr);
 455 }
 456 
 457 /**
 458  * pidfd_create() - Create a new pid file descriptor.
 459  *
 460  * @pid:  struct pid that the pidfd will reference
 461  *
 462  * This creates a new pid file descriptor with the O_CLOEXEC flag set.
 463  *
  464  * Note that this function can only be called after the fd table has
 465  * been unshared to avoid leaking the pidfd to the new process.
 466  *
 467  * Return: On success, a cloexec pidfd is returned.
 468  *         On error, a negative errno number will be returned.
 469  */
 470 static int pidfd_create(struct pid *pid)
 471 {
 472         int fd;
 473 
 474         fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
 475                               O_RDWR | O_CLOEXEC);
 476         if (fd < 0)
 477                 put_pid(pid);
 478 
 479         return fd;
 480 }
 481 
 482 /**
 483  * pidfd_open() - Open new pid file descriptor.
 484  *
 485  * @pid:   pid for which to retrieve a pidfd
  486  * @flags: flags to pass (currently unused; must be 0)
 487  *
 488  * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 489  * the process identified by @pid. Currently, the process identified by
 490  * @pid must be a thread-group leader. This restriction currently exists
 491  * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
 492  * be used with CLONE_THREAD) and pidfd polling (only supports thread group
 493  * leaders).
 494  *
 495  * Return: On success, a cloexec pidfd is returned.
 496  *         On error, a negative errno number will be returned.
 497  */
 498 SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 499 {
 500         int fd, ret;
 501         struct pid *p;
 502 
 503         if (flags)
 504                 return -EINVAL;
 505 
 506         if (pid <= 0)
 507                 return -EINVAL;
 508 
 509         p = find_get_pid(pid);
 510         if (!p)
 511                 return -ESRCH;
 512 
 513         ret = 0;
 514         rcu_read_lock();
 515         if (!pid_task(p, PIDTYPE_TGID))
 516                 ret = -EINVAL;
 517         rcu_read_unlock();
 518 
 519         fd = ret ?: pidfd_create(p);
 520         put_pid(p);
 521         return fd;
 522 }
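/*
 * Userspace usage sketch (illustrative only; assumes a libc exposing
 * __NR_pidfd_open and a kernel providing this syscall):
 *
 *	#include <poll.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int pidfd = syscall(__NR_pidfd_open, pid, 0);
 *	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
 *
 *	poll(&pfd, 1, -1);	(POLLIN is reported once the process exits)
 *	close(pidfd);
 *
 * Error handling is omitted for brevity; syscall() returns -1 with errno
 * set on failure.
 */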
 523 
 524 void __init pid_idr_init(void)
 525 {
 526         /* Verify no one has done anything silly: */
 527         BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 528 
 529         /* bump default and minimum pid_max based on number of cpus */
 530         pid_max = min(pid_max_max, max_t(int, pid_max,
 531                                 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
 532         pid_max_min = max_t(int, pid_max_min,
 533                                 PIDS_PER_CPU_MIN * num_possible_cpus());
 534         pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
 535 
 536         idr_init(&init_pid_ns.idr);
 537 
 538         init_pid_ns.pid_cachep = KMEM_CACHE(pid,
 539                         SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
 540 }
