1 /*
2  *  Generic process-grouping system.
3  *
4  *  Based originally on the cpuset system, extracted by Paul Menage
5  *  Copyright (C) 2006 Google, Inc
6  *
7  *  Notifications support
8  *  Copyright (C) 2009 Nokia Corporation
9  *  Author: Kirill A. Shutemov
10  *
11  *  Copyright notices from the original cpuset code:
12  *  --------------------------------------------------
13  *  Copyright (C) 2003 BULL SA.
14  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
15  *
16  *  Portions derived from Patrick Mochel's sysfs code.
17  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
18  *
19  *  2003-10-10 Written by Simon Derr.
20  *  2003-10-22 Updates by Stephen Hemminger.
21  *  2004 May-July Rework by Paul Jackson.
22  *  ---------------------------------------------------
23  *
24  *  This file is subject to the terms and conditions of the GNU General Public
25  *  License.  See the file COPYING in the main directory of the Linux
26  *  distribution for more details.
27  */
28 
29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30 
31 #include <linux/cgroup.h>
32 #include <linux/cred.h>
33 #include <linux/ctype.h>
34 #include <linux/errno.h>
35 #include <linux/init_task.h>
36 #include <linux/kernel.h>
37 #include <linux/list.h>
38 #include <linux/magic.h>
39 #include <linux/mm.h>
40 #include <linux/mutex.h>
41 #include <linux/mount.h>
42 #include <linux/pagemap.h>
43 #include <linux/proc_fs.h>
44 #include <linux/rcupdate.h>
45 #include <linux/sched.h>
46 #include <linux/slab.h>
47 #include <linux/spinlock.h>
48 #include <linux/rwsem.h>
49 #include <linux/string.h>
50 #include <linux/sort.h>
51 #include <linux/kmod.h>
52 #include <linux/delayacct.h>
53 #include <linux/cgroupstats.h>
54 #include <linux/hashtable.h>
55 #include <linux/pid_namespace.h>
56 #include <linux/idr.h>
57 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58 #include <linux/kthread.h>
59 #include <linux/delay.h>
60 
61 #include <linux/atomic.h>
62 
63 /*
 * pidlists linger for the following amount of time before being destroyed.
 * The goal is to avoid frequent destruction in the middle of consecutive
 * read calls.  Expiring in the middle is a performance problem, not a
 * correctness one.  1 sec should be enough.
68  */
69 #define CGROUP_PIDLIST_DESTROY_DELAY	HZ
70 
71 #define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
72 					 MAX_CFTYPE_NAME + 2)
73 
74 /*
75  * cgroup_mutex is the master lock.  Any modification to cgroup or its
76  * hierarchy must be performed while holding it.
77  *
78  * css_set_rwsem protects task->cgroups pointer, the list of css_set
79  * objects, and the chain of tasks off each css_set.
80  *
81  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
82  * cgroup.h can use them for lockdep annotations.
83  */
84 #ifdef CONFIG_PROVE_RCU
85 DEFINE_MUTEX(cgroup_mutex);
86 DECLARE_RWSEM(css_set_rwsem);
87 EXPORT_SYMBOL_GPL(cgroup_mutex);
88 EXPORT_SYMBOL_GPL(css_set_rwsem);
89 #else
90 static DEFINE_MUTEX(cgroup_mutex);
91 static DECLARE_RWSEM(css_set_rwsem);
92 #endif
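
/*
 * Illustrative sketch (not part of the original file) of the intended
 * locking pattern.  A read-side lookup of task->cgroups:
 *
 *	rcu_read_lock();
 *	cset = task_css_set(task);
 *	rcu_read_unlock();
 *
 * and a modification, which needs cgroup_mutex plus the write lock:
 *
 *	mutex_lock(&cgroup_mutex);
 *	down_write(&css_set_rwsem);
 *	...
 *	up_write(&css_set_rwsem);
 *	mutex_unlock(&cgroup_mutex);
 */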
93 
94 /*
95  * Protects cgroup_idr and css_idr so that IDs can be released without
96  * grabbing cgroup_mutex.
97  */
98 static DEFINE_SPINLOCK(cgroup_idr_lock);
99 
100 /*
 * Protects cgroup_root->release_agent_path.  Modifying it also requires
102  * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
103  */
104 static DEFINE_SPINLOCK(release_agent_path_lock);
105 
106 #define cgroup_assert_mutex_or_rcu_locked()				\
107 	rcu_lockdep_assert(rcu_read_lock_held() ||			\
108 			   lockdep_is_held(&cgroup_mutex),		\
109 			   "cgroup_mutex or RCU read lock required");
110 
111 /*
112  * cgroup destruction makes heavy use of work items and there can be a lot
113  * of concurrent destructions.  Use a separate workqueue so that cgroup
114  * destruction work items don't end up filling up max_active of system_wq
115  * which may lead to deadlock.
116  */
117 static struct workqueue_struct *cgroup_destroy_wq;
118 
119 /*
120  * pidlist destructions need to be flushed on cgroup destruction.  Use a
121  * separate workqueue as flush domain.
122  */
123 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
124 
125 /* generate an array of cgroup subsystem pointers */
126 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
127 static struct cgroup_subsys *cgroup_subsys[] = {
128 #include <linux/cgroup_subsys.h>
129 };
130 #undef SUBSYS
131 
132 /* array of cgroup subsystem names */
133 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
134 static const char *cgroup_subsys_name[] = {
135 #include <linux/cgroup_subsys.h>
136 };
137 #undef SUBSYS
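
/*
 * A sketch of what the SUBSYS() expansions above produce.  Assuming
 * cgroup_subsys.h lists, for example, cpuset, the arrays end up as:
 *
 *	static struct cgroup_subsys *cgroup_subsys[] = {
 *		[cpuset_cgrp_id] = &cpuset_cgrp_subsys,
 *		...
 *	};
 *	static const char *cgroup_subsys_name[] = {
 *		[cpuset_cgrp_id] = "cpuset",
 *		...
 *	};
 */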
138 
139 /*
140  * The default hierarchy, reserved for the subsystems that are otherwise
141  * unattached - it never has more than a single cgroup, and all tasks are
142  * part of that cgroup.
143  */
144 struct cgroup_root cgrp_dfl_root;
145 
146 /*
147  * The default hierarchy always exists but is hidden until mounted for the
148  * first time.  This is for backward compatibility.
149  */
150 static bool cgrp_dfl_root_visible;
151 
152 /*
153  * Set by the boot param of the same name and makes subsystems with NULL
 * ->dfl_files use ->legacy_files on the default hierarchy.
155  */
156 static bool cgroup_legacy_files_on_dfl;
157 
158 /* some controllers are not supported in the default hierarchy */
159 static unsigned int cgrp_dfl_root_inhibit_ss_mask;
160 
161 /* The list of hierarchy roots */
162 
163 static LIST_HEAD(cgroup_roots);
164 static int cgroup_root_count;
165 
166 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
167 static DEFINE_IDR(cgroup_hierarchy_idr);
168 
169 /*
170  * Assign a monotonically increasing serial number to csses.  It guarantees
171  * cgroups with bigger numbers are newer than those with smaller numbers.
172  * Also, as csses are always appended to the parent's ->children list, it
173  * guarantees that sibling csses are always sorted in the ascending serial
174  * number order on the list.  Protected by cgroup_mutex.
175  */
176 static u64 css_serial_nr_next = 1;
177 
178 /* This flag indicates whether tasks in the fork and exit paths should
179  * check for fork/exit handlers to call. This avoids us having to do
180  * extra work in the fork/exit path if none of the subsystems need to
181  * be called.
182  */
183 static int need_forkexit_callback __read_mostly;
184 
185 static struct cftype cgroup_dfl_base_files[];
186 static struct cftype cgroup_legacy_base_files[];
187 
188 static int rebind_subsystems(struct cgroup_root *dst_root,
189 			     unsigned int ss_mask);
190 static int cgroup_destroy_locked(struct cgroup *cgrp);
191 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
192 		      bool visible);
193 static void css_release(struct percpu_ref *ref);
194 static void kill_css(struct cgroup_subsys_state *css);
195 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
196 			      bool is_add);
197 
198 /* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
200 			    gfp_t gfp_mask)
201 {
202 	int ret;
203 
204 	idr_preload(gfp_mask);
205 	spin_lock_bh(&cgroup_idr_lock);
206 	ret = idr_alloc(idr, ptr, start, end, gfp_mask);
207 	spin_unlock_bh(&cgroup_idr_lock);
208 	idr_preload_end();
209 	return ret;
210 }
211 
static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
213 {
214 	void *ret;
215 
216 	spin_lock_bh(&cgroup_idr_lock);
217 	ret = idr_replace(idr, ptr, id);
218 	spin_unlock_bh(&cgroup_idr_lock);
219 	return ret;
220 }
221 
static void cgroup_idr_remove(struct idr *idr, int id)
223 {
224 	spin_lock_bh(&cgroup_idr_lock);
225 	idr_remove(idr, id);
226 	spin_unlock_bh(&cgroup_idr_lock);
227 }
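
/*
 * Illustrative use of the wrappers above (not taken from the original
 * source): allocation typically happens under cgroup_mutex while the
 * matching removal may run from a release path without it, hence the
 * dedicated spinlock.
 *
 *	id = cgroup_idr_alloc(&root->cgroup_idr, cgrp, 1, 0, GFP_NOWAIT);
 *	if (id < 0)
 *		return id;
 *	...
 *	cgroup_idr_remove(&root->cgroup_idr, id);
 */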
228 
static struct cgroup *cgroup_parent(struct cgroup *cgrp)
230 {
231 	struct cgroup_subsys_state *parent_css = cgrp->self.parent;
232 
233 	if (parent_css)
234 		return container_of(parent_css, struct cgroup, self);
235 	return NULL;
236 }
237 
238 /**
239  * cgroup_css - obtain a cgroup's css for the specified subsystem
240  * @cgrp: the cgroup of interest
241  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
242  *
243  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
244  * function must be called either under cgroup_mutex or rcu_read_lock() and
245  * the caller is responsible for pinning the returned css if it wants to
246  * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
248  */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
250 					      struct cgroup_subsys *ss)
251 {
252 	if (ss)
253 		return rcu_dereference_check(cgrp->subsys[ss->id],
254 					lockdep_is_held(&cgroup_mutex));
255 	else
256 		return &cgrp->self;
257 }
258 
259 /**
260  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
261  * @cgrp: the cgroup of interest
262  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
263  *
 * Similar to cgroup_css() but returns the effective css, which is defined
265  * as the matching css of the nearest ancestor including self which has @ss
266  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
267  * function is guaranteed to return non-NULL css.
268  */
static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
270 						struct cgroup_subsys *ss)
271 {
272 	lockdep_assert_held(&cgroup_mutex);
273 
274 	if (!ss)
275 		return &cgrp->self;
276 
277 	if (!(cgrp->root->subsys_mask & (1 << ss->id)))
278 		return NULL;
279 
280 	/*
281 	 * This function is used while updating css associations and thus
282 	 * can't test the csses directly.  Use ->child_subsys_mask.
283 	 */
284 	while (cgroup_parent(cgrp) &&
285 	       !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
286 		cgrp = cgroup_parent(cgrp);
287 
288 	return cgroup_css(cgrp, ss);
289 }
290 
291 /**
292  * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
293  * @cgrp: the cgroup of interest
294  * @ss: the subsystem of interest
295  *
296  * Find and get the effective css of @cgrp for @ss.  The effective css is
297  * defined as the matching css of the nearest ancestor including self which
298  * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
299  * the root css is returned, so this function always returns a valid css.
300  * The returned css must be put using css_put().
301  */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
303 					     struct cgroup_subsys *ss)
304 {
305 	struct cgroup_subsys_state *css;
306 
307 	rcu_read_lock();
308 
309 	do {
310 		css = cgroup_css(cgrp, ss);
311 
312 		if (css && css_tryget_online(css))
313 			goto out_unlock;
314 		cgrp = cgroup_parent(cgrp);
315 	} while (cgrp);
316 
317 	css = init_css_set.subsys[ss->id];
318 	css_get(css);
319 out_unlock:
320 	rcu_read_unlock();
321 	return css;
322 }
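
/*
 * Minimal usage sketch for cgroup_get_e_css() (illustrative only).  The
 * returned css is pinned and must be released with css_put():
 *
 *	struct cgroup_subsys_state *css;
 *
 *	css = cgroup_get_e_css(cgrp, ss);
 *	...
 *	css_put(css);
 *
 * where @ss is any compiled-in subsystem.
 */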
323 
324 /* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
326 {
327 	return !(cgrp->self.flags & CSS_ONLINE);
328 }
329 
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
331 {
332 	struct cgroup *cgrp = of->kn->parent->priv;
333 	struct cftype *cft = of_cft(of);
334 
335 	/*
336 	 * This is open and unprotected implementation of cgroup_css().
337 	 * seq_css() is only called from a kernfs file operation which has
338 	 * an active reference on the file.  Because all the subsystem
339 	 * files are drained before a css is disassociated with a cgroup,
340 	 * the matching css from the cgroup's subsys table is guaranteed to
341 	 * be and stay valid until the enclosing operation is complete.
342 	 */
343 	if (cft->ss)
344 		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
345 	else
346 		return &cgrp->self;
347 }
348 EXPORT_SYMBOL_GPL(of_css);
349 
350 /**
351  * cgroup_is_descendant - test ancestry
352  * @cgrp: the cgroup to be tested
353  * @ancestor: possible ancestor of @cgrp
354  *
355  * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
356  * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
357  * and @ancestor are accessible.
358  */
bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
360 {
361 	while (cgrp) {
362 		if (cgrp == ancestor)
363 			return true;
364 		cgrp = cgroup_parent(cgrp);
365 	}
366 	return false;
367 }
368 
static int notify_on_release(const struct cgroup *cgrp)
370 {
371 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
372 }
373 
374 /**
375  * for_each_css - iterate all css's of a cgroup
376  * @css: the iteration cursor
377  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
378  * @cgrp: the target cgroup to iterate css's of
379  *
380  * Should be called under cgroup_[tree_]mutex.
381  */
382 #define for_each_css(css, ssid, cgrp)					\
383 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
384 		if (!((css) = rcu_dereference_check(			\
385 				(cgrp)->subsys[(ssid)],			\
386 				lockdep_is_held(&cgroup_mutex)))) { }	\
387 		else
388 
389 /**
390  * for_each_e_css - iterate all effective css's of a cgroup
391  * @css: the iteration cursor
392  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
393  * @cgrp: the target cgroup to iterate css's of
394  *
395  * Should be called under cgroup_[tree_]mutex.
396  */
397 #define for_each_e_css(css, ssid, cgrp)					\
398 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
399 		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
400 			;						\
401 		else
402 
403 /**
404  * for_each_subsys - iterate all enabled cgroup subsystems
405  * @ss: the iteration cursor
406  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
407  */
408 #define for_each_subsys(ss, ssid)					\
409 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
410 	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
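
/*
 * Example use of the iteration helpers above (illustrative).  The first
 * loop needs no locking; the second must run under cgroup_mutex:
 *
 *	struct cgroup_subsys *ss;
 *	struct cgroup_subsys_state *css;
 *	int ssid;
 *
 *	for_each_subsys(ss, ssid)
 *		pr_info("subsys %s has id %d\n", ss->name, ssid);
 *
 *	for_each_css(css, ssid, cgrp)
 *		pr_info("subsys %d has a css on this cgroup\n", ssid);
 */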
411 
412 /* iterate across the hierarchies */
413 #define for_each_root(root)						\
414 	list_for_each_entry((root), &cgroup_roots, root_list)
415 
416 /* iterate over child cgrps, lock should be held throughout iteration */
417 #define cgroup_for_each_live_child(child, cgrp)				\
418 	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
419 		if (({ lockdep_assert_held(&cgroup_mutex);		\
420 		       cgroup_is_dead(child); }))			\
421 			;						\
422 		else
423 
424 static void cgroup_release_agent(struct work_struct *work);
425 static void check_for_release(struct cgroup *cgrp);
426 
427 /*
428  * A cgroup can be associated with multiple css_sets as different tasks may
429  * belong to different cgroups on different hierarchies.  In the other
430  * direction, a css_set is naturally associated with multiple cgroups.
431  * This M:N relationship is represented by the following link structure
432  * which exists for each association and allows traversing the associations
433  * from both sides.
434  */
435 struct cgrp_cset_link {
436 	/* the cgroup and css_set this link associates */
437 	struct cgroup		*cgrp;
438 	struct css_set		*cset;
439 
440 	/* list of cgrp_cset_links anchored at cgrp->cset_links */
441 	struct list_head	cset_link;
442 
443 	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
444 	struct list_head	cgrp_link;
445 };
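
/*
 * Traversal sketch (illustrative): the two list_heads above allow the
 * M:N relationship to be walked from either side.
 *
 *	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 *		visit(link->cset);
 *
 *	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
 *		visit(link->cgrp);
 *
 * where visit() stands in for whatever the caller wants to do.
 */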
446 
447 /*
448  * The default css_set - used by init and its children prior to any
449  * hierarchies being mounted. It contains a pointer to the root state
450  * for each subsystem. Also used to anchor the list of css_sets. Not
451  * reference-counted, to improve performance when child cgroups
452  * haven't been created.
453  */
454 struct css_set init_css_set = {
455 	.refcount		= ATOMIC_INIT(1),
456 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
457 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
458 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
459 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
460 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
461 };
462 
463 static int css_set_count	= 1;	/* 1 for init_css_set */
464 
465 /**
 * cgroup_update_populated - update the populated count of a cgroup
467  * @cgrp: the target cgroup
468  * @populated: inc or dec populated count
469  *
470  * @cgrp is either getting the first task (css_set) or losing the last.
471  * Update @cgrp->populated_cnt accordingly.  The count is propagated
472  * towards root so that a given cgroup's populated_cnt is zero iff the
473  * cgroup and all its descendants are empty.
474  *
475  * @cgrp's interface file "cgroup.populated" is zero if
476  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
477  * changes from or to zero, userland is notified that the content of the
478  * interface file has changed.  This can be used to detect when @cgrp and
479  * its descendants become populated or empty.
480  */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
482 {
483 	lockdep_assert_held(&css_set_rwsem);
484 
485 	do {
486 		bool trigger;
487 
488 		if (populated)
489 			trigger = !cgrp->populated_cnt++;
490 		else
491 			trigger = !--cgrp->populated_cnt;
492 
493 		if (!trigger)
494 			break;
495 
496 		if (cgrp->populated_kn)
497 			kernfs_notify(cgrp->populated_kn);
498 		cgrp = cgroup_parent(cgrp);
499 	} while (cgrp);
500 }
501 
502 /*
 * hash table for css_sets.  This improves the performance of finding an
 * existing css_set.  This hash doesn't (currently) take into account
 * cgroups in empty hierarchies.
506  */
507 #define CSS_SET_HASH_BITS	7
508 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
509 
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
511 {
512 	unsigned long key = 0UL;
513 	struct cgroup_subsys *ss;
514 	int i;
515 
516 	for_each_subsys(ss, i)
517 		key += (unsigned long)css[i];
518 	key = (key >> 16) ^ key;
519 
520 	return key;
521 }
522 
static void put_css_set_locked(struct css_set *cset)
524 {
525 	struct cgrp_cset_link *link, *tmp_link;
526 	struct cgroup_subsys *ss;
527 	int ssid;
528 
529 	lockdep_assert_held(&css_set_rwsem);
530 
531 	if (!atomic_dec_and_test(&cset->refcount))
532 		return;
533 
534 	/* This css_set is dead. unlink it and release cgroup refcounts */
535 	for_each_subsys(ss, ssid)
536 		list_del(&cset->e_cset_node[ssid]);
537 	hash_del(&cset->hlist);
538 	css_set_count--;
539 
540 	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
541 		struct cgroup *cgrp = link->cgrp;
542 
543 		list_del(&link->cset_link);
544 		list_del(&link->cgrp_link);
545 
546 		/* @cgrp can't go away while we're holding css_set_rwsem */
547 		if (list_empty(&cgrp->cset_links)) {
548 			cgroup_update_populated(cgrp, false);
549 			check_for_release(cgrp);
550 		}
551 
552 		kfree(link);
553 	}
554 
555 	kfree_rcu(cset, rcu_head);
556 }
557 
static void put_css_set(struct css_set *cset)
559 {
560 	/*
561 	 * Ensure that the refcount doesn't hit zero while any readers
562 	 * can see it. Similar to atomic_dec_and_lock(), but for an
563 	 * rwlock
564 	 */
565 	if (atomic_add_unless(&cset->refcount, -1, 1))
566 		return;
567 
568 	down_write(&css_set_rwsem);
569 	put_css_set_locked(cset);
570 	up_write(&css_set_rwsem);
571 }
572 
573 /*
574  * refcounted get/put for css_set objects
575  */
static inline void get_css_set(struct css_set *cset)
577 {
578 	atomic_inc(&cset->refcount);
579 }
580 
581 /**
582  * compare_css_sets - helper function for find_existing_css_set().
583  * @cset: candidate css_set being tested
584  * @old_cset: existing css_set for a task
585  * @new_cgrp: cgroup that's being entered by the task
586  * @template: desired set of css pointers in css_set (pre-calculated)
587  *
588  * Returns true if "cset" matches "old_cset" except for the hierarchy
589  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
590  */
static bool compare_css_sets(struct css_set *cset,
592 			     struct css_set *old_cset,
593 			     struct cgroup *new_cgrp,
594 			     struct cgroup_subsys_state *template[])
595 {
596 	struct list_head *l1, *l2;
597 
598 	/*
599 	 * On the default hierarchy, there can be csets which are
600 	 * associated with the same set of cgroups but different csses.
601 	 * Let's first ensure that csses match.
602 	 */
603 	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
604 		return false;
605 
606 	/*
607 	 * Compare cgroup pointers in order to distinguish between
608 	 * different cgroups in hierarchies.  As different cgroups may
609 	 * share the same effective css, this comparison is always
610 	 * necessary.
611 	 */
612 	l1 = &cset->cgrp_links;
613 	l2 = &old_cset->cgrp_links;
614 	while (1) {
615 		struct cgrp_cset_link *link1, *link2;
616 		struct cgroup *cgrp1, *cgrp2;
617 
618 		l1 = l1->next;
619 		l2 = l2->next;
620 		/* See if we reached the end - both lists are equal length. */
621 		if (l1 == &cset->cgrp_links) {
622 			BUG_ON(l2 != &old_cset->cgrp_links);
623 			break;
624 		} else {
625 			BUG_ON(l2 == &old_cset->cgrp_links);
626 		}
627 		/* Locate the cgroups associated with these links. */
628 		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
629 		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
630 		cgrp1 = link1->cgrp;
631 		cgrp2 = link2->cgrp;
632 		/* Hierarchies should be linked in the same order. */
633 		BUG_ON(cgrp1->root != cgrp2->root);
634 
635 		/*
636 		 * If this hierarchy is the hierarchy of the cgroup
637 		 * that's changing, then we need to check that this
638 		 * css_set points to the new cgroup; if it's any other
639 		 * hierarchy, then this css_set should point to the
640 		 * same cgroup as the old css_set.
641 		 */
642 		if (cgrp1->root == new_cgrp->root) {
643 			if (cgrp1 != new_cgrp)
644 				return false;
645 		} else {
646 			if (cgrp1 != cgrp2)
647 				return false;
648 		}
649 	}
650 	return true;
651 }
652 
653 /**
654  * find_existing_css_set - init css array and find the matching css_set
655  * @old_cset: the css_set that we're using before the cgroup transition
656  * @cgrp: the cgroup that we're moving into
657  * @template: out param for the new set of csses, should be clear on entry
658  */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
660 					struct cgroup *cgrp,
661 					struct cgroup_subsys_state *template[])
662 {
663 	struct cgroup_root *root = cgrp->root;
664 	struct cgroup_subsys *ss;
665 	struct css_set *cset;
666 	unsigned long key;
667 	int i;
668 
669 	/*
670 	 * Build the set of subsystem state objects that we want to see in the
671 	 * new css_set. while subsystems can change globally, the entries here
672 	 * won't change, so no need for locking.
673 	 */
674 	for_each_subsys(ss, i) {
675 		if (root->subsys_mask & (1UL << i)) {
676 			/*
677 			 * @ss is in this hierarchy, so we want the
678 			 * effective css from @cgrp.
679 			 */
680 			template[i] = cgroup_e_css(cgrp, ss);
681 		} else {
682 			/*
683 			 * @ss is not in this hierarchy, so we don't want
684 			 * to change the css.
685 			 */
686 			template[i] = old_cset->subsys[i];
687 		}
688 	}
689 
690 	key = css_set_hash(template);
691 	hash_for_each_possible(css_set_table, cset, hlist, key) {
692 		if (!compare_css_sets(cset, old_cset, cgrp, template))
693 			continue;
694 
695 		/* This css_set matches what we need */
696 		return cset;
697 	}
698 
699 	/* No existing cgroup group matched */
700 	return NULL;
701 }
702 
static void free_cgrp_cset_links(struct list_head *links_to_free)
704 {
705 	struct cgrp_cset_link *link, *tmp_link;
706 
707 	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
708 		list_del(&link->cset_link);
709 		kfree(link);
710 	}
711 }
712 
713 /**
714  * allocate_cgrp_cset_links - allocate cgrp_cset_links
715  * @count: the number of links to allocate
716  * @tmp_links: list_head the allocated links are put on
717  *
718  * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
719  * through ->cset_link.  Returns 0 on success or -errno.
720  */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
722 {
723 	struct cgrp_cset_link *link;
724 	int i;
725 
726 	INIT_LIST_HEAD(tmp_links);
727 
728 	for (i = 0; i < count; i++) {
729 		link = kzalloc(sizeof(*link), GFP_KERNEL);
730 		if (!link) {
731 			free_cgrp_cset_links(tmp_links);
732 			return -ENOMEM;
733 		}
734 		list_add(&link->cset_link, tmp_links);
735 	}
736 	return 0;
737 }
738 
739 /**
740  * link_css_set - a helper function to link a css_set to a cgroup
741  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
742  * @cset: the css_set to be linked
743  * @cgrp: the destination cgroup
744  */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
746 			 struct cgroup *cgrp)
747 {
748 	struct cgrp_cset_link *link;
749 
750 	BUG_ON(list_empty(tmp_links));
751 
752 	if (cgroup_on_dfl(cgrp))
753 		cset->dfl_cgrp = cgrp;
754 
755 	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
756 	link->cset = cset;
757 	link->cgrp = cgrp;
758 
759 	if (list_empty(&cgrp->cset_links))
760 		cgroup_update_populated(cgrp, true);
761 	list_move(&link->cset_link, &cgrp->cset_links);
762 
763 	/*
764 	 * Always add links to the tail of the list so that the list
765 	 * is sorted by order of hierarchy creation
766 	 */
767 	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
768 }
769 
770 /**
771  * find_css_set - return a new css_set with one cgroup updated
772  * @old_cset: the baseline css_set
773  * @cgrp: the cgroup to be updated
774  *
775  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
776  * substituted into the appropriate hierarchy.
777  */
static struct css_set *find_css_set(struct css_set *old_cset,
779 				    struct cgroup *cgrp)
780 {
781 	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
782 	struct css_set *cset;
783 	struct list_head tmp_links;
784 	struct cgrp_cset_link *link;
785 	struct cgroup_subsys *ss;
786 	unsigned long key;
787 	int ssid;
788 
789 	lockdep_assert_held(&cgroup_mutex);
790 
	/* First see if we already have a css_set that matches
	 * the desired set */
793 	down_read(&css_set_rwsem);
794 	cset = find_existing_css_set(old_cset, cgrp, template);
795 	if (cset)
796 		get_css_set(cset);
797 	up_read(&css_set_rwsem);
798 
799 	if (cset)
800 		return cset;
801 
802 	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
803 	if (!cset)
804 		return NULL;
805 
806 	/* Allocate all the cgrp_cset_link objects that we'll need */
807 	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
808 		kfree(cset);
809 		return NULL;
810 	}
811 
812 	atomic_set(&cset->refcount, 1);
813 	INIT_LIST_HEAD(&cset->cgrp_links);
814 	INIT_LIST_HEAD(&cset->tasks);
815 	INIT_LIST_HEAD(&cset->mg_tasks);
816 	INIT_LIST_HEAD(&cset->mg_preload_node);
817 	INIT_LIST_HEAD(&cset->mg_node);
818 	INIT_HLIST_NODE(&cset->hlist);
819 
820 	/* Copy the set of subsystem state objects generated in
821 	 * find_existing_css_set() */
822 	memcpy(cset->subsys, template, sizeof(cset->subsys));
823 
824 	down_write(&css_set_rwsem);
825 	/* Add reference counts and links from the new css_set. */
826 	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
827 		struct cgroup *c = link->cgrp;
828 
829 		if (c->root == cgrp->root)
830 			c = cgrp;
831 		link_css_set(&tmp_links, cset, c);
832 	}
833 
834 	BUG_ON(!list_empty(&tmp_links));
835 
836 	css_set_count++;
837 
838 	/* Add @cset to the hash table */
839 	key = css_set_hash(cset->subsys);
840 	hash_add(css_set_table, &cset->hlist, key);
841 
842 	for_each_subsys(ss, ssid)
843 		list_add_tail(&cset->e_cset_node[ssid],
844 			      &cset->subsys[ssid]->cgroup->e_csets[ssid]);
845 
846 	up_write(&css_set_rwsem);
847 
848 	return cset;
849 }
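
/*
 * Rough pairing sketch for find_css_set() (hypothetical caller, variable
 * names are placeholders).  The function returns with a reference held,
 * which the caller must eventually drop with put_css_set():
 *
 *	new_cset = find_css_set(task_css_set(task), dst_cgrp);
 *	if (!new_cset)
 *		return -ENOMEM;
 *	...
 *	put_css_set(new_cset);
 */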
850 
static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
852 {
853 	struct cgroup *root_cgrp = kf_root->kn->priv;
854 
855 	return root_cgrp->root;
856 }
857 
static int cgroup_init_root_id(struct cgroup_root *root)
859 {
860 	int id;
861 
862 	lockdep_assert_held(&cgroup_mutex);
863 
864 	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
865 	if (id < 0)
866 		return id;
867 
868 	root->hierarchy_id = id;
869 	return 0;
870 }
871 
static void cgroup_exit_root_id(struct cgroup_root *root)
873 {
874 	lockdep_assert_held(&cgroup_mutex);
875 
876 	if (root->hierarchy_id) {
877 		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
878 		root->hierarchy_id = 0;
879 	}
880 }
881 
static void cgroup_free_root(struct cgroup_root *root)
883 {
884 	if (root) {
		/* hierarchy ID should already have been released */
886 		WARN_ON_ONCE(root->hierarchy_id);
887 
888 		idr_destroy(&root->cgroup_idr);
889 		kfree(root);
890 	}
891 }
892 
static void cgroup_destroy_root(struct cgroup_root *root)
894 {
895 	struct cgroup *cgrp = &root->cgrp;
896 	struct cgrp_cset_link *link, *tmp_link;
897 
898 	mutex_lock(&cgroup_mutex);
899 
900 	BUG_ON(atomic_read(&root->nr_cgrps));
901 	BUG_ON(!list_empty(&cgrp->self.children));
902 
903 	/* Rebind all subsystems back to the default hierarchy */
904 	rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
905 
906 	/*
907 	 * Release all the links from cset_links to this hierarchy's
908 	 * root cgroup
909 	 */
910 	down_write(&css_set_rwsem);
911 
912 	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
913 		list_del(&link->cset_link);
914 		list_del(&link->cgrp_link);
915 		kfree(link);
916 	}
917 	up_write(&css_set_rwsem);
918 
919 	if (!list_empty(&root->root_list)) {
920 		list_del(&root->root_list);
921 		cgroup_root_count--;
922 	}
923 
924 	cgroup_exit_root_id(root);
925 
926 	mutex_unlock(&cgroup_mutex);
927 
928 	kernfs_destroy_root(root->kf_root);
929 	cgroup_free_root(root);
930 }
931 
932 /* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
934 					    struct cgroup_root *root)
935 {
936 	struct cgroup *res = NULL;
937 
938 	lockdep_assert_held(&cgroup_mutex);
939 	lockdep_assert_held(&css_set_rwsem);
940 
941 	if (cset == &init_css_set) {
942 		res = &root->cgrp;
943 	} else {
944 		struct cgrp_cset_link *link;
945 
946 		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
947 			struct cgroup *c = link->cgrp;
948 
949 			if (c->root == root) {
950 				res = c;
951 				break;
952 			}
953 		}
954 	}
955 
956 	BUG_ON(!res);
957 	return res;
958 }
959 
960 /*
961  * Return the cgroup for "task" from the given hierarchy. Must be
962  * called with cgroup_mutex and css_set_rwsem held.
963  */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
965 					    struct cgroup_root *root)
966 {
967 	/*
968 	 * No need to lock the task - since we hold cgroup_mutex the
969 	 * task can't change groups, so the only thing that can happen
	 * is that it exits and its css_set is set back to init_css_set.
971 	 */
972 	return cset_cgroup_from_root(task_css_set(task), root);
973 }
974 
975 /*
976  * A task must hold cgroup_mutex to modify cgroups.
977  *
978  * Any task can increment and decrement the count field without lock.
979  * So in general, code holding cgroup_mutex can't rely on the count
980  * field not changing.  However, if the count goes to zero, then only
981  * cgroup_attach_task() can increment it again.  Because a count of zero
982  * means that no tasks are currently attached, therefore there is no
983  * way a task attached to that cgroup can fork (the other way to
984  * increment the count).  So code holding cgroup_mutex can safely
985  * assume that if the count is zero, it will stay zero. Similarly, if
986  * a task holds cgroup_mutex on a cgroup with zero count, it
987  * knows that the cgroup won't be removed, as cgroup_rmdir()
988  * needs that mutex.
989  *
990  * A cgroup can only be deleted if both its 'count' of using tasks
991  * is zero, and its list of 'children' cgroups is empty.  Since all
992  * tasks in the system use _some_ cgroup, and since there is always at
993  * least one task in the system (init, pid == 1), therefore, root cgroup
994  * always has either children cgroups and/or using tasks.  So we don't
995  * need a special hack to ensure that root cgroup cannot be deleted.
996  *
997  * P.S.  One more locking exception.  RCU is used to guard the
998  * update of a tasks cgroup pointer by cgroup_attach_task()
999  */
1000 
1001 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
1002 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1003 static const struct file_operations proc_cgroupstats_operations;
1004 
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1006 			      char *buf)
1007 {
1008 	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1009 	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1010 		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1011 			 cft->ss->name, cft->name);
1012 	else
1013 		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1014 	return buf;
1015 }
1016 
1017 /**
1018  * cgroup_file_mode - deduce file mode of a control file
1019  * @cft: the control file in question
1020  *
1021  * returns cft->mode if ->mode is not 0
1022  * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
1023  * returns S_IRUGO if it has only a read handler
 * returns S_IWUSR if it has only a write handler
1025  */
static umode_t cgroup_file_mode(const struct cftype *cft)
1027 {
1028 	umode_t mode = 0;
1029 
1030 	if (cft->mode)
1031 		return cft->mode;
1032 
1033 	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1034 		mode |= S_IRUGO;
1035 
1036 	if (cft->write_u64 || cft->write_s64 || cft->write)
1037 		mode |= S_IWUSR;
1038 
1039 	return mode;
1040 }
1041 
static void cgroup_get(struct cgroup *cgrp)
1043 {
1044 	WARN_ON_ONCE(cgroup_is_dead(cgrp));
1045 	css_get(&cgrp->self);
1046 }
1047 
static bool cgroup_tryget(struct cgroup *cgrp)
1049 {
1050 	return css_tryget(&cgrp->self);
1051 }
1052 
static void cgroup_put(struct cgroup *cgrp)
1054 {
1055 	css_put(&cgrp->self);
1056 }
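
/*
 * Reference-counting sketch (illustrative): cgroup_get()/cgroup_put()
 * pin a cgroup known to be alive, while cgroup_tryget() is for contexts
 * where the cgroup may already be on its way out:
 *
 *	if (!cgroup_tryget(cgrp))
 *		return NULL;
 *	...
 *	cgroup_put(cgrp);
 */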
1057 
1058 /**
1059  * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
1060  * @cgrp: the target cgroup
1061  * @subtree_control: the new subtree_control mask to consider
1062  *
1063  * On the default hierarchy, a subsystem may request other subsystems to be
1064  * enabled together through its ->depends_on mask.  In such cases, more
1065  * subsystems than specified in "cgroup.subtree_control" may be enabled.
1066  *
1067  * This function calculates which subsystems need to be enabled if
1068  * @subtree_control is to be applied to @cgrp.  The returned mask is always
1069  * a superset of @subtree_control and follows the usual hierarchy rules.
1070  */
static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
1072 						  unsigned int subtree_control)
1073 {
1074 	struct cgroup *parent = cgroup_parent(cgrp);
1075 	unsigned int cur_ss_mask = subtree_control;
1076 	struct cgroup_subsys *ss;
1077 	int ssid;
1078 
1079 	lockdep_assert_held(&cgroup_mutex);
1080 
1081 	if (!cgroup_on_dfl(cgrp))
1082 		return cur_ss_mask;
1083 
1084 	while (true) {
1085 		unsigned int new_ss_mask = cur_ss_mask;
1086 
1087 		for_each_subsys(ss, ssid)
1088 			if (cur_ss_mask & (1 << ssid))
1089 				new_ss_mask |= ss->depends_on;
1090 
1091 		/*
1092 		 * Mask out subsystems which aren't available.  This can
1093 		 * happen only if some depended-upon subsystems were bound
1094 		 * to non-default hierarchies.
1095 		 */
1096 		if (parent)
1097 			new_ss_mask &= parent->child_subsys_mask;
1098 		else
1099 			new_ss_mask &= cgrp->root->subsys_mask;
1100 
1101 		if (new_ss_mask == cur_ss_mask)
1102 			break;
1103 		cur_ss_mask = new_ss_mask;
1104 	}
1105 
1106 	return cur_ss_mask;
1107 }
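
/*
 * Worked example with hypothetical subsystems A and B.  Suppose
 * @subtree_control is (1 << A) and A's ->depends_on is (1 << B):
 *
 *	pass 1: new_ss_mask = (1 << A) | (1 << B), masked against the
 *		parent's child_subsys_mask (or the root's subsys_mask)
 *	pass 2: no new dependencies, fixed point reached
 *
 * The result is (1 << A) | (1 << B) if B is available on this hierarchy
 * and just (1 << A) if it isn't.
 */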
1108 
1109 /**
1110  * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1111  * @cgrp: the target cgroup
1112  *
1113  * Update @cgrp->child_subsys_mask according to the current
1114  * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
1115  */
static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1117 {
1118 	cgrp->child_subsys_mask =
1119 		cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
1120 }
1121 
1122 /**
1123  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1124  * @kn: the kernfs_node being serviced
1125  *
1126  * This helper undoes cgroup_kn_lock_live() and should be invoked before
1127  * the method finishes if locking succeeded.  Note that once this function
1128  * returns the cgroup returned by cgroup_kn_lock_live() may become
1129  * inaccessible any time.  If the caller intends to continue to access the
1130  * cgroup, it should pin it before invoking this function.
1131  */
static void cgroup_kn_unlock(struct kernfs_node *kn)
1133 {
1134 	struct cgroup *cgrp;
1135 
1136 	if (kernfs_type(kn) == KERNFS_DIR)
1137 		cgrp = kn->priv;
1138 	else
1139 		cgrp = kn->parent->priv;
1140 
1141 	mutex_unlock(&cgroup_mutex);
1142 
1143 	kernfs_unbreak_active_protection(kn);
1144 	cgroup_put(cgrp);
1145 }
1146 
1147 /**
1148  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1149  * @kn: the kernfs_node being serviced
1150  *
1151  * This helper is to be used by a cgroup kernfs method currently servicing
1152  * @kn.  It breaks the active protection, performs cgroup locking and
1153  * verifies that the associated cgroup is alive.  Returns the cgroup if
1154  * alive; otherwise, %NULL.  A successful return should be undone by a
1155  * matching cgroup_kn_unlock() invocation.
1156  *
1157  * Any cgroup kernfs method implementation which requires locking the
1158  * associated cgroup should use this helper.  It avoids nesting cgroup
1159  * locking under kernfs active protection and allows all kernfs operations
1160  * including self-removal.
1161  */
static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
1163 {
1164 	struct cgroup *cgrp;
1165 
1166 	if (kernfs_type(kn) == KERNFS_DIR)
1167 		cgrp = kn->priv;
1168 	else
1169 		cgrp = kn->parent->priv;
1170 
1171 	/*
1172 	 * We're gonna grab cgroup_mutex which nests outside kernfs
1173 	 * active_ref.  cgroup liveliness check alone provides enough
1174 	 * protection against removal.  Ensure @cgrp stays accessible and
1175 	 * break the active_ref protection.
1176 	 */
1177 	if (!cgroup_tryget(cgrp))
1178 		return NULL;
1179 	kernfs_break_active_protection(kn);
1180 
1181 	mutex_lock(&cgroup_mutex);
1182 
1183 	if (!cgroup_is_dead(cgrp))
1184 		return cgrp;
1185 
1186 	cgroup_kn_unlock(kn);
1187 	return NULL;
1188 }
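
/*
 * Typical shape of a kernfs method using the two helpers above (a
 * sketch, not lifted from a real handler):
 *
 *	static ssize_t some_write(struct kernfs_open_file *of, char *buf,
 *				  size_t nbytes, loff_t off)
 *	{
 *		struct cgroup *cgrp = cgroup_kn_lock_live(of->kn);
 *
 *		if (!cgrp)
 *			return -ENODEV;
 *		...
 *		cgroup_kn_unlock(of->kn);
 *		return nbytes;
 *	}
 */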
1189 
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1191 {
1192 	char name[CGROUP_FILE_NAME_MAX];
1193 
1194 	lockdep_assert_held(&cgroup_mutex);
1195 	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1196 }
1197 
1198 /**
1199  * cgroup_clear_dir - remove subsys files in a cgroup directory
1200  * @cgrp: target cgroup
1201  * @subsys_mask: mask of the subsystem ids whose files should be removed
1202  */
static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
1204 {
1205 	struct cgroup_subsys *ss;
1206 	int i;
1207 
1208 	for_each_subsys(ss, i) {
1209 		struct cftype *cfts;
1210 
1211 		if (!(subsys_mask & (1 << i)))
1212 			continue;
1213 		list_for_each_entry(cfts, &ss->cfts, node)
1214 			cgroup_addrm_files(cgrp, cfts, false);
1215 	}
1216 }
1217 
static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1219 {
1220 	struct cgroup_subsys *ss;
1221 	unsigned int tmp_ss_mask;
1222 	int ssid, i, ret;
1223 
1224 	lockdep_assert_held(&cgroup_mutex);
1225 
1226 	for_each_subsys(ss, ssid) {
1227 		if (!(ss_mask & (1 << ssid)))
1228 			continue;
1229 
1230 		/* if @ss has non-root csses attached to it, can't move */
1231 		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
1232 			return -EBUSY;
1233 
1234 		/* can't move between two non-dummy roots either */
1235 		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1236 			return -EBUSY;
1237 	}
1238 
1239 	/* skip creating root files on dfl_root for inhibited subsystems */
1240 	tmp_ss_mask = ss_mask;
1241 	if (dst_root == &cgrp_dfl_root)
1242 		tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1243 
1244 	ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
1245 	if (ret) {
1246 		if (dst_root != &cgrp_dfl_root)
1247 			return ret;
1248 
1249 		/*
1250 		 * Rebinding back to the default root is not allowed to
1251 		 * fail.  Using both default and non-default roots should
1252 		 * be rare.  Moving subsystems back and forth even more so.
1253 		 * Just warn about it and continue.
1254 		 */
1255 		if (cgrp_dfl_root_visible) {
1256 			pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1257 				ret, ss_mask);
1258 			pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1259 		}
1260 	}
1261 
1262 	/*
1263 	 * Nothing can fail from this point on.  Remove files for the
1264 	 * removed subsystems and rebind each subsystem.
1265 	 */
1266 	for_each_subsys(ss, ssid)
1267 		if (ss_mask & (1 << ssid))
1268 			cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1269 
1270 	for_each_subsys(ss, ssid) {
1271 		struct cgroup_root *src_root;
1272 		struct cgroup_subsys_state *css;
1273 		struct css_set *cset;
1274 
1275 		if (!(ss_mask & (1 << ssid)))
1276 			continue;
1277 
1278 		src_root = ss->root;
1279 		css = cgroup_css(&src_root->cgrp, ss);
1280 
1281 		WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1282 
1283 		RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1284 		rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1285 		ss->root = dst_root;
1286 		css->cgroup = &dst_root->cgrp;
1287 
1288 		down_write(&css_set_rwsem);
1289 		hash_for_each(css_set_table, i, cset, hlist)
1290 			list_move_tail(&cset->e_cset_node[ss->id],
1291 				       &dst_root->cgrp.e_csets[ss->id]);
1292 		up_write(&css_set_rwsem);
1293 
1294 		src_root->subsys_mask &= ~(1 << ssid);
1295 		src_root->cgrp.subtree_control &= ~(1 << ssid);
1296 		cgroup_refresh_child_subsys_mask(&src_root->cgrp);
1297 
1298 		/* default hierarchy doesn't enable controllers by default */
1299 		dst_root->subsys_mask |= 1 << ssid;
1300 		if (dst_root != &cgrp_dfl_root) {
1301 			dst_root->cgrp.subtree_control |= 1 << ssid;
1302 			cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
1303 		}
1304 
1305 		if (ss->bind)
1306 			ss->bind(css);
1307 	}
1308 
1309 	kernfs_activate(dst_root->cgrp.kn);
1310 	return 0;
1311 }
1312 
static int cgroup_show_options(struct seq_file *seq,
1314 			       struct kernfs_root *kf_root)
1315 {
1316 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1317 	struct cgroup_subsys *ss;
1318 	int ssid;
1319 
1320 	for_each_subsys(ss, ssid)
1321 		if (root->subsys_mask & (1 << ssid))
1322 			seq_show_option(seq, ss->name, NULL);
1323 	if (root->flags & CGRP_ROOT_NOPREFIX)
1324 		seq_puts(seq, ",noprefix");
1325 	if (root->flags & CGRP_ROOT_XATTR)
1326 		seq_puts(seq, ",xattr");
1327 
1328 	spin_lock(&release_agent_path_lock);
1329 	if (strlen(root->release_agent_path))
1330 		seq_show_option(seq, "release_agent",
1331 				root->release_agent_path);
1332 	spin_unlock(&release_agent_path_lock);
1333 
1334 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1335 		seq_puts(seq, ",clone_children");
1336 	if (strlen(root->name))
1337 		seq_show_option(seq, "name", root->name);
1338 	return 0;
1339 }
1340 
1341 struct cgroup_sb_opts {
1342 	unsigned int subsys_mask;
1343 	unsigned int flags;
1344 	char *release_agent;
1345 	bool cpuset_clone_children;
1346 	char *name;
1347 	/* User explicitly requested empty subsystem */
1348 	bool none;
1349 };
1350 
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1352 {
1353 	char *token, *o = data;
1354 	bool all_ss = false, one_ss = false;
1355 	unsigned int mask = -1U;
1356 	struct cgroup_subsys *ss;
1357 	int nr_opts = 0;
1358 	int i;
1359 
1360 #ifdef CONFIG_CPUSETS
1361 	mask = ~(1U << cpuset_cgrp_id);
1362 #endif
1363 
1364 	memset(opts, 0, sizeof(*opts));
1365 
1366 	while ((token = strsep(&o, ",")) != NULL) {
1367 		nr_opts++;
1368 
1369 		if (!*token)
1370 			return -EINVAL;
1371 		if (!strcmp(token, "none")) {
1372 			/* Explicitly have no subsystems */
1373 			opts->none = true;
1374 			continue;
1375 		}
1376 		if (!strcmp(token, "all")) {
1377 			/* Mutually exclusive option 'all' + subsystem name */
1378 			if (one_ss)
1379 				return -EINVAL;
1380 			all_ss = true;
1381 			continue;
1382 		}
1383 		if (!strcmp(token, "__DEVEL__sane_behavior")) {
1384 			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1385 			continue;
1386 		}
1387 		if (!strcmp(token, "noprefix")) {
1388 			opts->flags |= CGRP_ROOT_NOPREFIX;
1389 			continue;
1390 		}
1391 		if (!strcmp(token, "clone_children")) {
1392 			opts->cpuset_clone_children = true;
1393 			continue;
1394 		}
1395 		if (!strcmp(token, "xattr")) {
1396 			opts->flags |= CGRP_ROOT_XATTR;
1397 			continue;
1398 		}
1399 		if (!strncmp(token, "release_agent=", 14)) {
1400 			/* Specifying two release agents is forbidden */
1401 			if (opts->release_agent)
1402 				return -EINVAL;
1403 			opts->release_agent =
1404 				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1405 			if (!opts->release_agent)
1406 				return -ENOMEM;
1407 			continue;
1408 		}
1409 		if (!strncmp(token, "name=", 5)) {
1410 			const char *name = token + 5;
1411 			/* Can't specify an empty name */
1412 			if (!strlen(name))
1413 				return -EINVAL;
1414 			/* Must match [\w.-]+ */
1415 			for (i = 0; i < strlen(name); i++) {
1416 				char c = name[i];
1417 				if (isalnum(c))
1418 					continue;
1419 				if ((c == '.') || (c == '-') || (c == '_'))
1420 					continue;
1421 				return -EINVAL;
1422 			}
1423 			/* Specifying two names is forbidden */
1424 			if (opts->name)
1425 				return -EINVAL;
1426 			opts->name = kstrndup(name,
1427 					      MAX_CGROUP_ROOT_NAMELEN - 1,
1428 					      GFP_KERNEL);
1429 			if (!opts->name)
1430 				return -ENOMEM;
1431 
1432 			continue;
1433 		}
1434 
1435 		for_each_subsys(ss, i) {
1436 			if (strcmp(token, ss->name))
1437 				continue;
1438 			if (ss->disabled)
1439 				continue;
1440 
1441 			/* Mutually exclusive option 'all' + subsystem name */
1442 			if (all_ss)
1443 				return -EINVAL;
1444 			opts->subsys_mask |= (1 << i);
1445 			one_ss = true;
1446 
1447 			break;
1448 		}
1449 		if (i == CGROUP_SUBSYS_COUNT)
1450 			return -ENOENT;
1451 	}
1452 
1453 	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1454 		pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1455 		if (nr_opts != 1) {
1456 			pr_err("sane_behavior: no other mount options allowed\n");
1457 			return -EINVAL;
1458 		}
1459 		return 0;
1460 	}
1461 
1462 	/*
	 * If the 'all' option was specified, select all the subsystems.
	 * Otherwise, if none of 'none', 'name=' or a subsystem name was
	 * specified, default to 'all'.
1466 	 */
1467 	if (all_ss || (!one_ss && !opts->none && !opts->name))
1468 		for_each_subsys(ss, i)
1469 			if (!ss->disabled)
1470 				opts->subsys_mask |= (1 << i);
1471 
1472 	/*
1473 	 * We either have to specify by name or by subsystems. (So all
1474 	 * empty hierarchies must have a name).
1475 	 */
1476 	if (!opts->subsys_mask && !opts->name)
1477 		return -EINVAL;
1478 
1479 	/*
1480 	 * Option noprefix was introduced just for backward compatibility
1481 	 * with the old cpuset, so we allow noprefix only if mounting just
1482 	 * the cpuset subsystem.
1483 	 */
1484 	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1485 		return -EINVAL;
1486 
1487 	/* Can't specify "none" and some subsystems */
1488 	if (opts->subsys_mask && opts->none)
1489 		return -EINVAL;
1490 
1491 	return 0;
1492 }
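
/*
 * Worked example (illustrative, assumes CONFIG_CPUSETS).  Mount data of
 *
 *	"noprefix,cpuset,release_agent=/sbin/cgroup_release"
 *
 * parses into:
 *
 *	opts->flags         = CGRP_ROOT_NOPREFIX
 *	opts->subsys_mask   = 1 << cpuset_cgrp_id
 *	opts->release_agent = "/sbin/cgroup_release"
 *
 * noprefix is accepted here because cpuset is the only subsystem
 * requested.
 */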
1493 
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1495 {
1496 	int ret = 0;
1497 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1498 	struct cgroup_sb_opts opts;
1499 	unsigned int added_mask, removed_mask;
1500 
1501 	if (root == &cgrp_dfl_root) {
1502 		pr_err("remount is not allowed\n");
1503 		return -EINVAL;
1504 	}
1505 
1506 	mutex_lock(&cgroup_mutex);
1507 
1508 	/* See what subsystems are wanted */
1509 	ret = parse_cgroupfs_options(data, &opts);
1510 	if (ret)
1511 		goto out_unlock;
1512 
1513 	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1514 		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1515 			task_tgid_nr(current), current->comm);
1516 
1517 	added_mask = opts.subsys_mask & ~root->subsys_mask;
1518 	removed_mask = root->subsys_mask & ~opts.subsys_mask;
1519 
1520 	/* Don't allow flags or name to change at remount */
1521 	if ((opts.flags ^ root->flags) ||
1522 	    (opts.name && strcmp(opts.name, root->name))) {
1523 		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1524 		       opts.flags, opts.name ?: "", root->flags, root->name);
1525 		ret = -EINVAL;
1526 		goto out_unlock;
1527 	}
1528 
1529 	/* remounting is not allowed for populated hierarchies */
1530 	if (!list_empty(&root->cgrp.self.children)) {
1531 		ret = -EBUSY;
1532 		goto out_unlock;
1533 	}
1534 
1535 	ret = rebind_subsystems(root, added_mask);
1536 	if (ret)
1537 		goto out_unlock;
1538 
1539 	rebind_subsystems(&cgrp_dfl_root, removed_mask);
1540 
1541 	if (opts.release_agent) {
1542 		spin_lock(&release_agent_path_lock);
1543 		strcpy(root->release_agent_path, opts.release_agent);
1544 		spin_unlock(&release_agent_path_lock);
1545 	}
1546  out_unlock:
1547 	kfree(opts.release_agent);
1548 	kfree(opts.name);
1549 	mutex_unlock(&cgroup_mutex);
1550 	return ret;
1551 }
1552 
1553 /*
1554  * To reduce the fork() overhead for systems that are not actually using
1555  * their cgroups capability, we don't maintain the lists running through
1556  * each css_set to its tasks until we see the list actually used - in other
1557  * words after the first mount.
1558  */
1559 static bool use_task_css_set_links __read_mostly;
1560 
static void cgroup_enable_task_cg_lists(void)
1562 {
1563 	struct task_struct *p, *g;
1564 
1565 	down_write(&css_set_rwsem);
1566 
1567 	if (use_task_css_set_links)
1568 		goto out_unlock;
1569 
1570 	use_task_css_set_links = true;
1571 
1572 	/*
1573 	 * We need tasklist_lock because RCU is not safe against
1574 	 * while_each_thread(). Besides, a forking task that has passed
1575 	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1576 	 * is not guaranteed to have its child immediately visible in the
1577 	 * tasklist if we walk through it with RCU.
1578 	 */
1579 	read_lock(&tasklist_lock);
1580 	do_each_thread(g, p) {
1581 		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1582 			     task_css_set(p) != &init_css_set);
1583 
1584 		/*
		 * We must check whether the process is exiting; otherwise
		 * we race with cgroup_exit() and the list entry never
		 * gets deleted even though the process has exited.
1588 		 * Do it while holding siglock so that we don't end up
1589 		 * racing against cgroup_exit().
1590 		 */
1591 		spin_lock_irq(&p->sighand->siglock);
1592 		if (!(p->flags & PF_EXITING)) {
1593 			struct css_set *cset = task_css_set(p);
1594 
1595 			list_add(&p->cg_list, &cset->tasks);
1596 			get_css_set(cset);
1597 		}
1598 		spin_unlock_irq(&p->sighand->siglock);
1599 	} while_each_thread(g, p);
1600 	read_unlock(&tasklist_lock);
1601 out_unlock:
1602 	up_write(&css_set_rwsem);
1603 }
1604 
static void init_cgroup_housekeeping(struct cgroup *cgrp)
1606 {
1607 	struct cgroup_subsys *ss;
1608 	int ssid;
1609 
1610 	INIT_LIST_HEAD(&cgrp->self.sibling);
1611 	INIT_LIST_HEAD(&cgrp->self.children);
1612 	INIT_LIST_HEAD(&cgrp->cset_links);
1613 	INIT_LIST_HEAD(&cgrp->pidlists);
1614 	mutex_init(&cgrp->pidlist_mutex);
1615 	cgrp->self.cgroup = cgrp;
1616 	cgrp->self.flags |= CSS_ONLINE;
1617 
1618 	for_each_subsys(ss, ssid)
1619 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1620 
1621 	init_waitqueue_head(&cgrp->offline_waitq);
1622 	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
1623 }
1624 
static void init_cgroup_root(struct cgroup_root *root,
1626 			     struct cgroup_sb_opts *opts)
1627 {
1628 	struct cgroup *cgrp = &root->cgrp;
1629 
1630 	INIT_LIST_HEAD(&root->root_list);
1631 	atomic_set(&root->nr_cgrps, 1);
1632 	cgrp->root = root;
1633 	init_cgroup_housekeeping(cgrp);
1634 	idr_init(&root->cgroup_idr);
1635 
1636 	root->flags = opts->flags;
1637 	if (opts->release_agent)
1638 		strcpy(root->release_agent_path, opts->release_agent);
1639 	if (opts->name)
1640 		strcpy(root->name, opts->name);
1641 	if (opts->cpuset_clone_children)
1642 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1643 }
1644 
static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1646 {
1647 	LIST_HEAD(tmp_links);
1648 	struct cgroup *root_cgrp = &root->cgrp;
1649 	struct cftype *base_files;
1650 	struct css_set *cset;
1651 	int i, ret;
1652 
1653 	lockdep_assert_held(&cgroup_mutex);
1654 
1655 	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
1656 	if (ret < 0)
1657 		goto out;
1658 	root_cgrp->id = ret;
1659 
1660 	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
1661 			      GFP_KERNEL);
1662 	if (ret)
1663 		goto out;
1664 
1665 	/*
1666 	 * We're accessing css_set_count without locking css_set_rwsem here,
1667 	 * but that's OK - it can only be increased by someone holding
1668 	 * cgroup_lock, and that's us. The worst that can happen is that we
1669 	 * have some link structures left over
1670 	 */
1671 	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1672 	if (ret)
1673 		goto cancel_ref;
1674 
1675 	ret = cgroup_init_root_id(root);
1676 	if (ret)
1677 		goto cancel_ref;
1678 
1679 	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1680 					   KERNFS_ROOT_CREATE_DEACTIVATED,
1681 					   root_cgrp);
1682 	if (IS_ERR(root->kf_root)) {
1683 		ret = PTR_ERR(root->kf_root);
1684 		goto exit_root_id;
1685 	}
1686 	root_cgrp->kn = root->kf_root->kn;
1687 
1688 	if (root == &cgrp_dfl_root)
1689 		base_files = cgroup_dfl_base_files;
1690 	else
1691 		base_files = cgroup_legacy_base_files;
1692 
1693 	ret = cgroup_addrm_files(root_cgrp, base_files, true);
1694 	if (ret)
1695 		goto destroy_root;
1696 
1697 	ret = rebind_subsystems(root, ss_mask);
1698 	if (ret)
1699 		goto destroy_root;
1700 
1701 	/*
1702 	 * There must be no failure case after here, since rebinding takes
1703 	 * care of subsystems' refcounts, which are explicitly dropped in
1704 	 * the failure exit path.
1705 	 */
1706 	list_add(&root->root_list, &cgroup_roots);
1707 	cgroup_root_count++;
1708 
1709 	/*
1710 	 * Link the root cgroup in this hierarchy into all the css_set
1711 	 * objects.
1712 	 */
1713 	down_write(&css_set_rwsem);
1714 	hash_for_each(css_set_table, i, cset, hlist)
1715 		link_css_set(&tmp_links, cset, root_cgrp);
1716 	up_write(&css_set_rwsem);
1717 
1718 	BUG_ON(!list_empty(&root_cgrp->self.children));
1719 	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1720 
1721 	kernfs_activate(root_cgrp->kn);
1722 	ret = 0;
1723 	goto out;
1724 
1725 destroy_root:
1726 	kernfs_destroy_root(root->kf_root);
1727 	root->kf_root = NULL;
1728 exit_root_id:
1729 	cgroup_exit_root_id(root);
1730 cancel_ref:
1731 	percpu_ref_exit(&root_cgrp->self.refcnt);
1732 out:
1733 	free_cgrp_cset_links(&tmp_links);
1734 	return ret;
1735 }
1736 
1737 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1738 			 int flags, const char *unused_dev_name,
1739 			 void *data)
1740 {
1741 	struct super_block *pinned_sb = NULL;
1742 	struct cgroup_subsys *ss;
1743 	struct cgroup_root *root;
1744 	struct cgroup_sb_opts opts;
1745 	struct dentry *dentry;
1746 	int ret;
1747 	int i;
1748 	bool new_sb;
1749 
1750 	/*
1751 	 * The first time anyone tries to mount a cgroup, enable the list
1752 	 * linking each css_set to its tasks and fix up all existing tasks.
1753 	 */
1754 	if (!use_task_css_set_links)
1755 		cgroup_enable_task_cg_lists();
1756 
1757 	mutex_lock(&cgroup_mutex);
1758 
1759 	/* First find the desired set of subsystems */
1760 	ret = parse_cgroupfs_options(data, &opts);
1761 	if (ret)
1762 		goto out_unlock;
1763 
1764 	/* look for a matching existing root */
1765 	if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
1766 		cgrp_dfl_root_visible = true;
1767 		root = &cgrp_dfl_root;
1768 		cgroup_get(&root->cgrp);
1769 		ret = 0;
1770 		goto out_unlock;
1771 	}
1772 
1773 	/*
1774 	 * Destruction of cgroup root is asynchronous, so subsystems may
1775 	 * still be dying after the previous unmount.  Let's drain the
1776 	 * dying subsystems.  We just need to ensure that the ones
1777 	 * unmounted previously finish dying and don't care about new ones
1778 	 * starting.  Testing ref liveliness is good enough.
1779 	 */
1780 	for_each_subsys(ss, i) {
1781 		if (!(opts.subsys_mask & (1 << i)) ||
1782 		    ss->root == &cgrp_dfl_root)
1783 			continue;
1784 
1785 		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
1786 			mutex_unlock(&cgroup_mutex);
1787 			msleep(10);
1788 			ret = restart_syscall();
1789 			goto out_free;
1790 		}
1791 		cgroup_put(&ss->root->cgrp);
1792 	}
1793 
1794 	for_each_root(root) {
1795 		bool name_match = false;
1796 
1797 		if (root == &cgrp_dfl_root)
1798 			continue;
1799 
1800 		/*
1801 		 * If we asked for a name then it must match.  Also, if
1802 		 * name matches but subsys_mask doesn't, we should fail.
1803 		 * Remember whether name matched.
1804 		 */
1805 		if (opts.name) {
1806 			if (strcmp(opts.name, root->name))
1807 				continue;
1808 			name_match = true;
1809 		}
1810 
1811 		/*
1812 		 * If we asked for subsystems (or explicitly for no
1813 		 * subsystems) then they must match.
1814 		 */
1815 		if ((opts.subsys_mask || opts.none) &&
1816 		    (opts.subsys_mask != root->subsys_mask)) {
1817 			if (!name_match)
1818 				continue;
1819 			ret = -EBUSY;
1820 			goto out_unlock;
1821 		}
1822 
1823 		if (root->flags ^ opts.flags)
1824 			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1825 
1826 		/*
1827 		 * We want to reuse @root whose lifetime is governed by its
1828 		 * ->cgrp.  Let's check whether @root is alive and keep it
1829 		 * that way.  As cgroup_kill_sb() can happen anytime, we
1830 		 * want to block it by pinning the sb so that @root doesn't
1831 		 * get killed before mount is complete.
1832 		 *
1833 		 * With the sb pinned, tryget_live can reliably indicate
1834 		 * whether @root can be reused.  If it's being killed,
1835 		 * drain it.  We can use wait_queue for the wait but this
1836 		 * path is super cold.  Let's just sleep a bit and retry.
1837 		 */
1838 		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
1839 		if (IS_ERR(pinned_sb) ||
1840 		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1841 			mutex_unlock(&cgroup_mutex);
1842 			if (!IS_ERR_OR_NULL(pinned_sb))
1843 				deactivate_super(pinned_sb);
1844 			msleep(10);
1845 			ret = restart_syscall();
1846 			goto out_free;
1847 		}
1848 
1849 		ret = 0;
1850 		goto out_unlock;
1851 	}
1852 
1853 	/*
1854 	 * No such thing, create a new one.  name= matching without subsys
1855 	 * specification is allowed for already existing hierarchies but we
1856 	 * can't create a new one without subsys specification.
1857 	 */
1858 	if (!opts.subsys_mask && !opts.none) {
1859 		ret = -EINVAL;
1860 		goto out_unlock;
1861 	}
1862 
1863 	root = kzalloc(sizeof(*root), GFP_KERNEL);
1864 	if (!root) {
1865 		ret = -ENOMEM;
1866 		goto out_unlock;
1867 	}
1868 
1869 	init_cgroup_root(root, &opts);
1870 
1871 	ret = cgroup_setup_root(root, opts.subsys_mask);
1872 	if (ret)
1873 		cgroup_free_root(root);
1874 
1875 out_unlock:
1876 	mutex_unlock(&cgroup_mutex);
1877 out_free:
1878 	kfree(opts.release_agent);
1879 	kfree(opts.name);
1880 
1881 	if (ret)
1882 		return ERR_PTR(ret);
1883 
1884 	dentry = kernfs_mount(fs_type, flags, root->kf_root,
1885 				CGROUP_SUPER_MAGIC, &new_sb);
1886 	if (IS_ERR(dentry) || !new_sb)
1887 		cgroup_put(&root->cgrp);
1888 
1889 	/*
1890 	 * If @pinned_sb, we're reusing an existing root and holding an
1891 	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
1892 	 */
1893 	if (pinned_sb) {
1894 		WARN_ON(new_sb);
1895 		deactivate_super(pinned_sb);
1896 	}
1897 
1898 	return dentry;
1899 }
1900 
1901 static void cgroup_kill_sb(struct super_block *sb)
1902 {
1903 	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1904 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1905 
1906 	/*
1907 	 * If @root doesn't have any mounts or children, start killing it.
1908 	 * This prevents new mounts by disabling percpu_ref_tryget_live().
1909 	 * cgroup_mount() may wait for @root's release.
1910 	 *
1911 	 * And don't kill the default root.
1912 	 */
1913 	if (!list_empty(&root->cgrp.self.children) ||
1914 	    root == &cgrp_dfl_root)
1915 		cgroup_put(&root->cgrp);
1916 	else
1917 		percpu_ref_kill(&root->cgrp.self.refcnt);
1918 
1919 	kernfs_kill_sb(sb);
1920 }
1921 
1922 static struct file_system_type cgroup_fs_type = {
1923 	.name = "cgroup",
1924 	.mount = cgroup_mount,
1925 	.kill_sb = cgroup_kill_sb,
1926 };
1927 
1928 /**
1929  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1930  * @task: target task
1931  * @buf: the buffer to write the path into
1932  * @buflen: the length of the buffer
1933  *
1934  * Determine @task's cgroup on the first (the one with the lowest non-zero
1935  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
1936  * function grabs cgroup_mutex and shouldn't be used inside locks used by
1937  * cgroup controller callbacks.
1938  *
1939  * Return value is the same as kernfs_path().
1940  */
1941 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1942 {
1943 	struct cgroup_root *root;
1944 	struct cgroup *cgrp;
1945 	int hierarchy_id = 1;
1946 	char *path = NULL;
1947 
1948 	mutex_lock(&cgroup_mutex);
1949 	down_read(&css_set_rwsem);
1950 
1951 	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1952 
1953 	if (root) {
1954 		cgrp = task_cgroup_from_root(task, root);
1955 		path = cgroup_path(cgrp, buf, buflen);
1956 	} else {
1957 		/* if no hierarchy exists, everyone is in "/" */
1958 		if (strlcpy(buf, "/", buflen) < buflen)
1959 			path = buf;
1960 	}
1961 
1962 	up_read(&css_set_rwsem);
1963 	mutex_unlock(&cgroup_mutex);
1964 	return path;
1965 }
1966 EXPORT_SYMBOL_GPL(task_cgroup_path);
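
/*
 * Illustrative sketch (not part of the original file): a minimal caller of
 * task_cgroup_path() above.  The helper name and the PATH_MAX-sized
 * allocation are assumptions made for the example only.
 */
static void example_log_task_cgroup(struct task_struct *task)
{
	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
	char *path;

	if (!buf)
		return;

	path = task_cgroup_path(task, buf, PATH_MAX);
	if (path)
		pr_info("task %d is in cgroup %s\n", task_pid_nr(task), path);
	kfree(buf);
}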
1967 
1968 /* used to track tasks and other necessary states during migration */
1969 struct cgroup_taskset {
1970 	/* the src and dst cset list running through cset->mg_node */
1971 	struct list_head	src_csets;
1972 	struct list_head	dst_csets;
1973 
1974 	/*
1975 	 * Fields for cgroup_taskset_*() iteration.
1976 	 *
1977 	 * Before migration is committed, the target migration tasks are on
1978 	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
1979 	 * the csets on ->dst_csets.  ->csets points to either ->src_csets
1980 	 * or ->dst_csets depending on whether migration is committed.
1981 	 *
1982 	 * ->cur_cset and ->cur_task point to the current task position
1983 	 * during iteration.
1984 	 */
1985 	struct list_head	*csets;
1986 	struct css_set		*cur_cset;
1987 	struct task_struct	*cur_task;
1988 };
1989 
1990 /**
1991  * cgroup_taskset_first - reset taskset and return the first task
1992  * @tset: taskset of interest
1993  *
1994  * @tset iteration is initialized and the first task is returned.
1995  */
1996 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1997 {
1998 	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1999 	tset->cur_task = NULL;
2000 
2001 	return cgroup_taskset_next(tset);
2002 }
2003 
2004 /**
2005  * cgroup_taskset_next - iterate to the next task in taskset
2006  * @tset: taskset of interest
2007  *
2008  * Return the next task in @tset.  Iteration must have been initialized
2009  * with cgroup_taskset_first().
2010  */
2011 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
2012 {
2013 	struct css_set *cset = tset->cur_cset;
2014 	struct task_struct *task = tset->cur_task;
2015 
2016 	while (&cset->mg_node != tset->csets) {
2017 		if (!task)
2018 			task = list_first_entry(&cset->mg_tasks,
2019 						struct task_struct, cg_list);
2020 		else
2021 			task = list_next_entry(task, cg_list);
2022 
2023 		if (&task->cg_list != &cset->mg_tasks) {
2024 			tset->cur_cset = cset;
2025 			tset->cur_task = task;
2026 			return task;
2027 		}
2028 
2029 		cset = list_next_entry(cset, mg_node);
2030 		task = NULL;
2031 	}
2032 
2033 	return NULL;
2034 }
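
/*
 * Illustrative sketch: how a controller's ->can_attach() callback would
 * typically walk the taskset using the iterators above (the
 * cgroup_taskset_for_each() macro in linux/cgroup.h wraps
 * cgroup_taskset_first()/cgroup_taskset_next()).  The callback name and the
 * kernel-thread check are made up for the example.
 */
static int example_can_attach(struct cgroup_subsys_state *css,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;

	cgroup_taskset_for_each(task, tset) {
		/* e.g. refuse to pull kernel threads into this controller */
		if (task->flags & PF_KTHREAD)
			return -EINVAL;
	}
	return 0;
}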
2035 
2036 /**
2037  * cgroup_task_migrate - move a task from one cgroup to another.
2038  * @old_cgrp: the cgroup @tsk is being migrated from
2039  * @tsk: the task being migrated
2040  * @new_cset: the new css_set @tsk is being attached to
2041  *
2042  * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
2043  */
2044 static void cgroup_task_migrate(struct cgroup *old_cgrp,
2045 				struct task_struct *tsk,
2046 				struct css_set *new_cset)
2047 {
2048 	struct css_set *old_cset;
2049 
2050 	lockdep_assert_held(&cgroup_mutex);
2051 	lockdep_assert_held(&css_set_rwsem);
2052 
2053 	/*
2054 	 * We are synchronized through threadgroup_lock() against PF_EXITING
2055 	 * setting such that we can't race against cgroup_exit() changing the
2056 	 * css_set to init_css_set and dropping the old one.
2057 	 */
2058 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
2059 	old_cset = task_css_set(tsk);
2060 
2061 	get_css_set(new_cset);
2062 	rcu_assign_pointer(tsk->cgroups, new_cset);
2063 
2064 	/*
2065 	 * Use move_tail so that cgroup_taskset_first() still returns the
2066 	 * leader after migration.  This works because cgroup_migrate()
2067 	 * ensures that the dst_cset of the leader is the first on the
2068 	 * tset's dst_csets list.
2069 	 */
2070 	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
2071 
2072 	/*
2073 	 * We just gained a reference on old_cset by taking it from the
2074 	 * task. As trading it for new_cset is protected by cgroup_mutex,
2075 	 * we're safe to drop it here; it will be freed under RCU.
2076 	 */
2077 	put_css_set_locked(old_cset);
2078 }
2079 
2080 /**
2081  * cgroup_migrate_finish - cleanup after attach
2082  * @preloaded_csets: list of preloaded css_sets
2083  *
2084  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
2085  * those functions for details.
2086  */
2087 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2088 {
2089 	struct css_set *cset, *tmp_cset;
2090 
2091 	lockdep_assert_held(&cgroup_mutex);
2092 
2093 	down_write(&css_set_rwsem);
2094 	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
2095 		cset->mg_src_cgrp = NULL;
2096 		cset->mg_dst_cset = NULL;
2097 		list_del_init(&cset->mg_preload_node);
2098 		put_css_set_locked(cset);
2099 	}
2100 	up_write(&css_set_rwsem);
2101 }
2102 
2103 /**
2104  * cgroup_migrate_add_src - add a migration source css_set
2105  * @src_cset: the source css_set to add
2106  * @dst_cgrp: the destination cgroup
2107  * @preloaded_csets: list of preloaded css_sets
2108  *
2109  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
2110  * @src_cset and add it to @preloaded_csets, which should later be cleaned
2111  * up by cgroup_migrate_finish().
2112  *
2113  * This function may be called without holding threadgroup_lock even if the
2114  * target is a process.  Threads may be created and destroyed but as long
2115  * as cgroup_mutex is not dropped, no new css_set can be put into play and
2116  * the preloaded css_sets are guaranteed to cover all migrations.
2117  */
2118 static void cgroup_migrate_add_src(struct css_set *src_cset,
2119 				   struct cgroup *dst_cgrp,
2120 				   struct list_head *preloaded_csets)
2121 {
2122 	struct cgroup *src_cgrp;
2123 
2124 	lockdep_assert_held(&cgroup_mutex);
2125 	lockdep_assert_held(&css_set_rwsem);
2126 
2127 	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2128 
2129 	if (!list_empty(&src_cset->mg_preload_node))
2130 		return;
2131 
2132 	WARN_ON(src_cset->mg_src_cgrp);
2133 	WARN_ON(!list_empty(&src_cset->mg_tasks));
2134 	WARN_ON(!list_empty(&src_cset->mg_node));
2135 
2136 	src_cset->mg_src_cgrp = src_cgrp;
2137 	get_css_set(src_cset);
2138 	list_add(&src_cset->mg_preload_node, preloaded_csets);
2139 }
2140 
2141 /**
2142  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2143  * @dst_cgrp: the destination cgroup (may be %NULL)
2144  * @preloaded_csets: list of preloaded source css_sets
2145  *
2146  * Tasks are about to be moved to @dst_cgrp and all the source css_sets
2147  * have been preloaded to @preloaded_csets.  This function looks up and
2148  * pins all destination css_sets, links each to its source, and appends them
2149  * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
2150  * source css_set is assumed to be its cgroup on the default hierarchy.
2151  *
2152  * This function must be called after cgroup_migrate_add_src() has been
2153  * called on each migration source css_set.  After migration is performed
2154  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2155  * @preloaded_csets.
2156  */
2157 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2158 				      struct list_head *preloaded_csets)
2159 {
2160 	LIST_HEAD(csets);
2161 	struct css_set *src_cset, *tmp_cset;
2162 
2163 	lockdep_assert_held(&cgroup_mutex);
2164 
2165 	/*
2166 	 * Except for the root, child_subsys_mask must be zero for a cgroup
2167 	 * with tasks so that child cgroups don't compete against tasks.
2168 	 */
2169 	if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2170 	    dst_cgrp->child_subsys_mask)
2171 		return -EBUSY;
2172 
2173 	/* look up the dst cset for each src cset and link it to src */
2174 	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
2175 		struct css_set *dst_cset;
2176 
2177 		dst_cset = find_css_set(src_cset,
2178 					dst_cgrp ?: src_cset->dfl_cgrp);
2179 		if (!dst_cset)
2180 			goto err;
2181 
2182 		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2183 
2184 		/*
2185 		 * If src cset equals dst, it's a noop.  Drop the src.
2186 		 * cgroup_migrate() will skip the cset too.  Note that we
2187 		 * can't handle src == dst as some nodes are used by both.
2188 		 */
2189 		if (src_cset == dst_cset) {
2190 			src_cset->mg_src_cgrp = NULL;
2191 			list_del_init(&src_cset->mg_preload_node);
2192 			put_css_set(src_cset);
2193 			put_css_set(dst_cset);
2194 			continue;
2195 		}
2196 
2197 		src_cset->mg_dst_cset = dst_cset;
2198 
2199 		if (list_empty(&dst_cset->mg_preload_node))
2200 			list_add(&dst_cset->mg_preload_node, &csets);
2201 		else
2202 			put_css_set(dst_cset);
2203 	}
2204 
2205 	list_splice_tail(&csets, preloaded_csets);
2206 	return 0;
2207 err:
2208 	cgroup_migrate_finish(&csets);
2209 	return -ENOMEM;
2210 }
2211 
2212 /**
2213  * cgroup_migrate - migrate a process or task to a cgroup
2214  * @cgrp: the destination cgroup
2215  * @leader: the leader of the process or the task to migrate
2216  * @threadgroup: whether @leader points to the whole process or a single task
2217  *
2218  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
2219  * process, the caller must be holding threadgroup_lock of @leader.  The
2220  * caller is also responsible for invoking cgroup_migrate_add_src() and
2221  * cgroup_migrate_prepare_dst() on the targets before invoking this
2222  * function and following up with cgroup_migrate_finish().
2223  *
2224  * As long as a controller's ->can_attach() doesn't fail, this function is
2225  * guaranteed to succeed.  This means that, excluding ->can_attach()
2226  * failure, when migrating multiple targets, the success or failure can be
2227  * decided for all targets by invoking cgroup_migrate_prepare_dst() before
2228  * actually starting to migrate.
2229  */
2230 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2231 			  bool threadgroup)
2232 {
2233 	struct cgroup_taskset tset = {
2234 		.src_csets	= LIST_HEAD_INIT(tset.src_csets),
2235 		.dst_csets	= LIST_HEAD_INIT(tset.dst_csets),
2236 		.csets		= &tset.src_csets,
2237 	};
2238 	struct cgroup_subsys_state *css, *failed_css = NULL;
2239 	struct css_set *cset, *tmp_cset;
2240 	struct task_struct *task, *tmp_task;
2241 	int i, ret;
2242 
2243 	/*
2244 	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2245 	 * already PF_EXITING could be freed from underneath us unless we
2246 	 * take an rcu_read_lock.
2247 	 */
2248 	down_write(&css_set_rwsem);
2249 	rcu_read_lock();
2250 	task = leader;
2251 	do {
2252 		/* @task either already exited or can't exit until the end */
2253 		if (task->flags & PF_EXITING)
2254 			goto next;
2255 
2256 		/* leave @task alone if post_fork() hasn't linked it yet */
2257 		if (list_empty(&task->cg_list))
2258 			goto next;
2259 
2260 		cset = task_css_set(task);
2261 		if (!cset->mg_src_cgrp)
2262 			goto next;
2263 
2264 		/*
2265 		 * cgroup_taskset_first() must always return the leader.
2266 		 * Take care to avoid disturbing the ordering.
2267 		 */
2268 		list_move_tail(&task->cg_list, &cset->mg_tasks);
2269 		if (list_empty(&cset->mg_node))
2270 			list_add_tail(&cset->mg_node, &tset.src_csets);
2271 		if (list_empty(&cset->mg_dst_cset->mg_node))
2272 			list_move_tail(&cset->mg_dst_cset->mg_node,
2273 				       &tset.dst_csets);
2274 	next:
2275 		if (!threadgroup)
2276 			break;
2277 	} while_each_thread(leader, task);
2278 	rcu_read_unlock();
2279 	up_write(&css_set_rwsem);
2280 
2281 	/* methods shouldn't be called if no task is actually migrating */
2282 	if (list_empty(&tset.src_csets))
2283 		return 0;
2284 
2285 	/* check that we can legitimately attach to the cgroup */
2286 	for_each_e_css(css, i, cgrp) {
2287 		if (css->ss->can_attach) {
2288 			ret = css->ss->can_attach(css, &tset);
2289 			if (ret) {
2290 				failed_css = css;
2291 				goto out_cancel_attach;
2292 			}
2293 		}
2294 	}
2295 
2296 	/*
2297 	 * Now that we're guaranteed success, proceed to move all tasks to
2298 	 * the new cgroup.  There are no failure cases after here, so this
2299 	 * is the commit point.
2300 	 */
2301 	down_write(&css_set_rwsem);
2302 	list_for_each_entry(cset, &tset.src_csets, mg_node) {
2303 		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2304 			cgroup_task_migrate(cset->mg_src_cgrp, task,
2305 					    cset->mg_dst_cset);
2306 	}
2307 	up_write(&css_set_rwsem);
2308 
2309 	/*
2310 	 * Migration is committed, all target tasks are now on dst_csets.
2311 	 * Nothing is sensitive to fork() after this point.  Notify
2312 	 * controllers that migration is complete.
2313 	 */
2314 	tset.csets = &tset.dst_csets;
2315 
2316 	for_each_e_css(css, i, cgrp)
2317 		if (css->ss->attach)
2318 			css->ss->attach(css, &tset);
2319 
2320 	ret = 0;
2321 	goto out_release_tset;
2322 
2323 out_cancel_attach:
2324 	for_each_e_css(css, i, cgrp) {
2325 		if (css == failed_css)
2326 			break;
2327 		if (css->ss->cancel_attach)
2328 			css->ss->cancel_attach(css, &tset);
2329 	}
2330 out_release_tset:
2331 	down_write(&css_set_rwsem);
2332 	list_splice_init(&tset.dst_csets, &tset.src_csets);
2333 	list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2334 		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2335 		list_del_init(&cset->mg_node);
2336 	}
2337 	up_write(&css_set_rwsem);
2338 	return ret;
2339 }
2340 
2341 /**
2342  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2343  * @dst_cgrp: the cgroup to attach to
2344  * @leader: the task or the leader of the threadgroup to be attached
2345  * @threadgroup: attach the whole threadgroup?
2346  *
2347  * Call holding cgroup_mutex and threadgroup_lock of @leader.
2348  */
2349 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2350 			      struct task_struct *leader, bool threadgroup)
2351 {
2352 	LIST_HEAD(preloaded_csets);
2353 	struct task_struct *task;
2354 	int ret;
2355 
2356 	/* look up all src csets */
2357 	down_read(&css_set_rwsem);
2358 	rcu_read_lock();
2359 	task = leader;
2360 	do {
2361 		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2362 				       &preloaded_csets);
2363 		if (!threadgroup)
2364 			break;
2365 	} while_each_thread(leader, task);
2366 	rcu_read_unlock();
2367 	up_read(&css_set_rwsem);
2368 
2369 	/* prepare dst csets and commit */
2370 	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2371 	if (!ret)
2372 		ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2373 
2374 	cgroup_migrate_finish(&preloaded_csets);
2375 	return ret;
2376 }
2377 
2378 /*
2379  * Find the task_struct of the task to attach by vpid and pass it along to the
2380  * function to attach either it or all tasks in its threadgroup. Will lock
2381  * cgroup_mutex and threadgroup.
2382  */
2383 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2384 				    size_t nbytes, loff_t off, bool threadgroup)
2385 {
2386 	struct task_struct *tsk;
2387 	const struct cred *cred = current_cred(), *tcred;
2388 	struct cgroup *cgrp;
2389 	pid_t pid;
2390 	int ret;
2391 
2392 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2393 		return -EINVAL;
2394 
2395 	cgrp = cgroup_kn_lock_live(of->kn);
2396 	if (!cgrp)
2397 		return -ENODEV;
2398 
2399 retry_find_task:
2400 	rcu_read_lock();
2401 	if (pid) {
2402 		tsk = find_task_by_vpid(pid);
2403 		if (!tsk) {
2404 			rcu_read_unlock();
2405 			ret = -ESRCH;
2406 			goto out_unlock_cgroup;
2407 		}
2408 		/*
2409 		 * even if we're attaching all tasks in the thread group, we
2410 		 * only need to check permissions on one of them.
2411 		 */
2412 		tcred = __task_cred(tsk);
2413 		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2414 		    !uid_eq(cred->euid, tcred->uid) &&
2415 		    !uid_eq(cred->euid, tcred->suid)) {
2416 			rcu_read_unlock();
2417 			ret = -EACCES;
2418 			goto out_unlock_cgroup;
2419 		}
2420 	} else
2421 		tsk = current;
2422 
2423 	if (threadgroup)
2424 		tsk = tsk->group_leader;
2425 
2426 	/*
2427 	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2428 	 * trapped in a cpuset, or an RT worker may be born in a cgroup
2429 	 * with no rt_runtime allocated.  Just say no.
2430 	 */
2431 	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2432 		ret = -EINVAL;
2433 		rcu_read_unlock();
2434 		goto out_unlock_cgroup;
2435 	}
2436 
2437 	get_task_struct(tsk);
2438 	rcu_read_unlock();
2439 
2440 	threadgroup_lock(tsk);
2441 	if (threadgroup) {
2442 		if (!thread_group_leader(tsk)) {
2443 			/*
2444 			 * a race with de_thread from another thread's exec()
2445 			 * may strip us of our leadership; if this happens,
2446 			 * there is no choice but to throw this task away and
2447 			 * try again; this is
2448 			 * "double-double-toil-and-trouble-check locking".
2449 			 */
2450 			threadgroup_unlock(tsk);
2451 			put_task_struct(tsk);
2452 			goto retry_find_task;
2453 		}
2454 	}
2455 
2456 	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2457 
2458 	threadgroup_unlock(tsk);
2459 
2460 	put_task_struct(tsk);
2461 out_unlock_cgroup:
2462 	cgroup_kn_unlock(of->kn);
2463 	return ret ?: nbytes;
2464 }
2465 
2466 /**
2467  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2468  * @from: attach to all cgroups of a given task
2469  * @tsk: the task to be attached
2470  */
2471 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2472 {
2473 	struct cgroup_root *root;
2474 	int retval = 0;
2475 
2476 	mutex_lock(&cgroup_mutex);
2477 	for_each_root(root) {
2478 		struct cgroup *from_cgrp;
2479 
2480 		if (root == &cgrp_dfl_root)
2481 			continue;
2482 
2483 		down_read(&css_set_rwsem);
2484 		from_cgrp = task_cgroup_from_root(from, root);
2485 		up_read(&css_set_rwsem);
2486 
2487 		retval = cgroup_attach_task(from_cgrp, tsk, false);
2488 		if (retval)
2489 			break;
2490 	}
2491 	mutex_unlock(&cgroup_mutex);
2492 
2493 	return retval;
2494 }
2495 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
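
/*
 * Illustrative sketch: cgroup_attach_task_all() is intended for callers
 * that create a kernel-side worker on behalf of a userspace task and want
 * it charged to the same cgroups.  The function and variable names below
 * are assumptions for the example.
 */
static void example_adopt_worker(struct task_struct *owner,
				 struct task_struct *worker)
{
	int err = cgroup_attach_task_all(owner, worker);

	if (err)
		pr_warn("failed to attach worker to %d's cgroups: %d\n",
			task_pid_nr(owner), err);
}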
2496 
2497 static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2498 				  char *buf, size_t nbytes, loff_t off)
2499 {
2500 	return __cgroup_procs_write(of, buf, nbytes, off, false);
2501 }
2502 
2503 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2504 				  char *buf, size_t nbytes, loff_t off)
2505 {
2506 	return __cgroup_procs_write(of, buf, nbytes, off, true);
2507 }
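
/*
 * Usage note (illustrative, not from the original file): writing a PID to
 * "tasks" moves only that thread, while writing to "cgroup.procs" moves
 * the whole thread group, e.g.
 *
 *	# echo $$ > /sys/fs/cgroup/cpu/mygroup/cgroup.procs
 *
 * The mount point and controller shown are assumptions about the setup.
 */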
2508 
2509 static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2510 					  char *buf, size_t nbytes, loff_t off)
2511 {
2512 	struct cgroup *cgrp;
2513 
2514 	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2515 
2516 	cgrp = cgroup_kn_lock_live(of->kn);
2517 	if (!cgrp)
2518 		return -ENODEV;
2519 	spin_lock(&release_agent_path_lock);
2520 	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2521 		sizeof(cgrp->root->release_agent_path));
2522 	spin_unlock(&release_agent_path_lock);
2523 	cgroup_kn_unlock(of->kn);
2524 	return nbytes;
2525 }
2526 
2527 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2528 {
2529 	struct cgroup *cgrp = seq_css(seq)->cgroup;
2530 
2531 	spin_lock(&release_agent_path_lock);
2532 	seq_puts(seq, cgrp->root->release_agent_path);
2533 	spin_unlock(&release_agent_path_lock);
2534 	seq_putc(seq, '\n');
2535 	return 0;
2536 }
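
/*
 * Usage note (illustrative): the release_agent path is configured on the
 * hierarchy root, e.g.
 *
 *	# echo /usr/local/sbin/cgroup-release > /sys/fs/cgroup/memory/release_agent
 *
 * and is invoked when a notify_on_release cgroup becomes empty.  The paths
 * shown are assumptions about the local setup.
 */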
2537 
2538 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2539 {
2540 	seq_puts(seq, "0\n");
2541 	return 0;
2542 }
2543 
2544 static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2545 {
2546 	struct cgroup_subsys *ss;
2547 	bool printed = false;
2548 	int ssid;
2549 
2550 	for_each_subsys(ss, ssid) {
2551 		if (ss_mask & (1 << ssid)) {
2552 			if (printed)
2553 				seq_putc(seq, ' ');
2554 			seq_printf(seq, "%s", ss->name);
2555 			printed = true;
2556 		}
2557 	}
2558 	if (printed)
2559 		seq_putc(seq, '\n');
2560 }
2561 
2562 /* show controllers which are currently attached to the default hierarchy */
2563 static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2564 {
2565 	struct cgroup *cgrp = seq_css(seq)->cgroup;
2566 
2567 	cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
2568 			     ~cgrp_dfl_root_inhibit_ss_mask);
2569 	return 0;
2570 }
2571 
2572 /* show controllers which are enabled from the parent */
2573 static int cgroup_controllers_show(struct seq_file *seq, void *v)
2574 {
2575 	struct cgroup *cgrp = seq_css(seq)->cgroup;
2576 
2577 	cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
2578 	return 0;
2579 }
2580 
2581 /* show controllers which are enabled for a given cgroup's children */
2582 static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2583 {
2584 	struct cgroup *cgrp = seq_css(seq)->cgroup;
2585 
2586 	cgroup_print_ss_mask(seq, cgrp->subtree_control);
2587 	return 0;
2588 }
2589 
2590 /**
2591  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2592  * @cgrp: root of the subtree to update csses for
2593  *
2594  * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2595  * css associations need to be updated accordingly.  This function looks up
2596  * all css_sets which are attached to the subtree, creates the matching
2597  * updated css_sets and migrates the tasks to the new ones.
2598  */
2599 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2600 {
2601 	LIST_HEAD(preloaded_csets);
2602 	struct cgroup_subsys_state *css;
2603 	struct css_set *src_cset;
2604 	int ret;
2605 
2606 	lockdep_assert_held(&cgroup_mutex);
2607 
2608 	/* look up all csses currently attached to @cgrp's subtree */
2609 	down_read(&css_set_rwsem);
2610 	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2611 		struct cgrp_cset_link *link;
2612 
2613 		/* self is not affected by child_subsys_mask change */
2614 		if (css->cgroup == cgrp)
2615 			continue;
2616 
2617 		list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2618 			cgroup_migrate_add_src(link->cset, cgrp,
2619 					       &preloaded_csets);
2620 	}
2621 	up_read(&css_set_rwsem);
2622 
2623 	/* NULL dst indicates self on default hierarchy */
2624 	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2625 	if (ret)
2626 		goto out_finish;
2627 
2628 	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2629 		struct task_struct *last_task = NULL, *task;
2630 
2631 		/* src_csets precede dst_csets, break on the first dst_cset */
2632 		if (!src_cset->mg_src_cgrp)
2633 			break;
2634 
2635 		/*
2636 		 * All tasks in src_cset need to be migrated to the
2637 		 * matching dst_cset.  Empty it process by process.  We
2638 		 * walk tasks but migrate processes.  The leader might even
2639 		 * belong to a different cset but such src_cset would also
2640 		 * be among the target src_csets because the default
2641 		 * hierarchy enforces per-process membership.
2642 		 */
2643 		while (true) {
2644 			down_read(&css_set_rwsem);
2645 			task = list_first_entry_or_null(&src_cset->tasks,
2646 						struct task_struct, cg_list);
2647 			if (task) {
2648 				task = task->group_leader;
2649 				WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2650 				get_task_struct(task);
2651 			}
2652 			up_read(&css_set_rwsem);
2653 
2654 			if (!task)
2655 				break;
2656 
2657 			/* guard against possible infinite loop */
2658 			if (WARN(last_task == task,
2659 				 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2660 				goto out_finish;
2661 			last_task = task;
2662 
2663 			threadgroup_lock(task);
2664 			/* raced against de_thread() from another thread? */
2665 			if (!thread_group_leader(task)) {
2666 				threadgroup_unlock(task);
2667 				put_task_struct(task);
2668 				continue;
2669 			}
2670 
2671 			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2672 
2673 			threadgroup_unlock(task);
2674 			put_task_struct(task);
2675 
2676 			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2677 				goto out_finish;
2678 		}
2679 	}
2680 
2681 out_finish:
2682 	cgroup_migrate_finish(&preloaded_csets);
2683 	return ret;
2684 }
2685 
2686 /* change the enabled child controllers for a cgroup in the default hierarchy */
2687 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2688 					    char *buf, size_t nbytes,
2689 					    loff_t off)
2690 {
2691 	unsigned int enable = 0, disable = 0;
2692 	unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
2693 	struct cgroup *cgrp, *child;
2694 	struct cgroup_subsys *ss;
2695 	char *tok;
2696 	int ssid, ret;
2697 
2698 	/*
2699 	 * Parse input - space separated list of subsystem names prefixed
2700 	 * with either + or -.
2701 	 */
2702 	buf = strstrip(buf);
2703 	while ((tok = strsep(&buf, " "))) {
2704 		if (tok[0] == '\0')
2705 			continue;
2706 		for_each_subsys(ss, ssid) {
2707 			if (ss->disabled || strcmp(tok + 1, ss->name) ||
2708 			    ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
2709 				continue;
2710 
2711 			if (*tok == '+') {
2712 				enable |= 1 << ssid;
2713 				disable &= ~(1 << ssid);
2714 			} else if (*tok == '-') {
2715 				disable |= 1 << ssid;
2716 				enable &= ~(1 << ssid);
2717 			} else {
2718 				return -EINVAL;
2719 			}
2720 			break;
2721 		}
2722 		if (ssid == CGROUP_SUBSYS_COUNT)
2723 			return -EINVAL;
2724 	}
2725 
2726 	cgrp = cgroup_kn_lock_live(of->kn);
2727 	if (!cgrp)
2728 		return -ENODEV;
2729 
2730 	for_each_subsys(ss, ssid) {
2731 		if (enable & (1 << ssid)) {
2732 			if (cgrp->subtree_control & (1 << ssid)) {
2733 				enable &= ~(1 << ssid);
2734 				continue;
2735 			}
2736 
2737 			/* unavailable or not enabled on the parent? */
2738 			if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2739 			    (cgroup_parent(cgrp) &&
2740 			     !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
2741 				ret = -ENOENT;
2742 				goto out_unlock;
2743 			}
2744 		} else if (disable & (1 << ssid)) {
2745 			if (!(cgrp->subtree_control & (1 << ssid))) {
2746 				disable &= ~(1 << ssid);
2747 				continue;
2748 			}
2749 
2750 			/* a child has it enabled? */
2751 			cgroup_for_each_live_child(child, cgrp) {
2752 				if (child->subtree_control & (1 << ssid)) {
2753 					ret = -EBUSY;
2754 					goto out_unlock;
2755 				}
2756 			}
2757 		}
2758 	}
2759 
2760 	if (!enable && !disable) {
2761 		ret = 0;
2762 		goto out_unlock;
2763 	}
2764 
2765 	/*
2766 	 * Except for the root, subtree_control must be zero for a cgroup
2767 	 * with tasks so that child cgroups don't compete against tasks.
2768 	 */
2769 	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
2770 		ret = -EBUSY;
2771 		goto out_unlock;
2772 	}
2773 
2774 	/*
2775 	 * Update subsys masks and calculate what needs to be done.  More
2776 	 * subsystems than specified may need to be enabled or disabled
2777 	 * depending on subsystem dependencies.
2778 	 */
2779 	old_sc = cgrp->subtree_control;
2780 	old_ss = cgrp->child_subsys_mask;
2781 	new_sc = (old_sc | enable) & ~disable;
2782 	new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
2783 
2784 	css_enable = ~old_ss & new_ss;
2785 	css_disable = old_ss & ~new_ss;
2786 	enable |= css_enable;
2787 	disable |= css_disable;
2788 
2789 	/*
2790 	 * Because css offlining is asynchronous, userland might try to
2791 	 * re-enable the same controller while the previous instance is
2792 	 * still around.  In such cases, wait till it's gone using
2793 	 * offline_waitq.
2794 	 */
2795 	for_each_subsys(ss, ssid) {
2796 		if (!(css_enable & (1 << ssid)))
2797 			continue;
2798 
2799 		cgroup_for_each_live_child(child, cgrp) {
2800 			DEFINE_WAIT(wait);
2801 
2802 			if (!cgroup_css(child, ss))
2803 				continue;
2804 
2805 			cgroup_get(child);
2806 			prepare_to_wait(&child->offline_waitq, &wait,
2807 					TASK_UNINTERRUPTIBLE);
2808 			cgroup_kn_unlock(of->kn);
2809 			schedule();
2810 			finish_wait(&child->offline_waitq, &wait);
2811 			cgroup_put(child);
2812 
2813 			return restart_syscall();
2814 		}
2815 	}
2816 
2817 	cgrp->subtree_control = new_sc;
2818 	cgrp->child_subsys_mask = new_ss;
2819 
2820 	/*
2821 	 * Create new csses or make the existing ones visible.  A css is
2822 	 * created invisible if it's being implicitly enabled through
2823 	 * dependency.  An invisible css is made visible when the userland
2824 	 * explicitly enables it.
2825 	 */
2826 	for_each_subsys(ss, ssid) {
2827 		if (!(enable & (1 << ssid)))
2828 			continue;
2829 
2830 		cgroup_for_each_live_child(child, cgrp) {
2831 			if (css_enable & (1 << ssid))
2832 				ret = create_css(child, ss,
2833 					cgrp->subtree_control & (1 << ssid));
2834 			else
2835 				ret = cgroup_populate_dir(child, 1 << ssid);
2836 			if (ret)
2837 				goto err_undo_css;
2838 		}
2839 	}
2840 
2841 	/*
2842 	 * At this point, cgroup_e_css() results reflect the new csses
2843 	 * making the following cgroup_update_dfl_csses() properly update
2844 	 * css associations of all tasks in the subtree.
2845 	 */
2846 	ret = cgroup_update_dfl_csses(cgrp);
2847 	if (ret)
2848 		goto err_undo_css;
2849 
2850 	/*
2851 	 * All tasks are migrated out of disabled csses.  Kill or hide
2852 	 * them.  A css is hidden when the userland requests it to be
2853 	 * disabled while other subsystems are still depending on it.  The
2854 	 * css must not actively control resources and be in the vanilla
2855 	 * state if it's made visible again later.  Controllers which may
2856 	 * be depended upon should provide ->css_reset() for this purpose.
2857 	 */
2858 	for_each_subsys(ss, ssid) {
2859 		if (!(disable & (1 << ssid)))
2860 			continue;
2861 
2862 		cgroup_for_each_live_child(child, cgrp) {
2863 			struct cgroup_subsys_state *css = cgroup_css(child, ss);
2864 
2865 			if (css_disable & (1 << ssid)) {
2866 				kill_css(css);
2867 			} else {
2868 				cgroup_clear_dir(child, 1 << ssid);
2869 				if (ss->css_reset)
2870 					ss->css_reset(css);
2871 			}
2872 		}
2873 	}
2874 
2875 	/*
2876 	 * The effective csses of all the descendants (excluding @cgrp) may
2877 	 * have changed.  Subsystems can optionally subscribe to this event
2878 	 * by implementing ->css_e_css_changed() which is invoked if any of
2879 	 * the effective csses seen from the css's cgroup may have changed.
2880 	 */
2881 	for_each_subsys(ss, ssid) {
2882 		struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
2883 		struct cgroup_subsys_state *css;
2884 
2885 		if (!ss->css_e_css_changed || !this_css)
2886 			continue;
2887 
2888 		css_for_each_descendant_pre(css, this_css)
2889 			if (css != this_css)
2890 				ss->css_e_css_changed(css);
2891 	}
2892 
2893 	kernfs_activate(cgrp->kn);
2894 	ret = 0;
2895 out_unlock:
2896 	cgroup_kn_unlock(of->kn);
2897 	return ret ?: nbytes;
2898 
2899 err_undo_css:
2900 	cgrp->subtree_control = old_sc;
2901 	cgrp->child_subsys_mask = old_ss;
2902 
2903 	for_each_subsys(ss, ssid) {
2904 		if (!(enable & (1 << ssid)))
2905 			continue;
2906 
2907 		cgroup_for_each_live_child(child, cgrp) {
2908 			struct cgroup_subsys_state *css = cgroup_css(child, ss);
2909 
2910 			if (!css)
2911 				continue;
2912 
2913 			if (css_enable & (1 << ssid))
2914 				kill_css(css);
2915 			else
2916 				cgroup_clear_dir(child, 1 << ssid);
2917 		}
2918 	}
2919 	goto out_unlock;
2920 }
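
/*
 * Usage note (illustrative): on the default hierarchy the file handled
 * above accepts a space separated list of "+name"/"-name" tokens, e.g.
 *
 *	# echo "+memory -cpu" > cgroup.subtree_control
 *
 * run from a cgroup's directory to enable or disable controllers for its
 * children.  The controller names are assumptions about what is compiled in.
 */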
2921 
2922 static int cgroup_populated_show(struct seq_file *seq, void *v)
2923 {
2924 	seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2925 	return 0;
2926 }
2927 
2928 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2929 				 size_t nbytes, loff_t off)
2930 {
2931 	struct cgroup *cgrp = of->kn->parent->priv;
2932 	struct cftype *cft = of->kn->priv;
2933 	struct cgroup_subsys_state *css;
2934 	int ret;
2935 
2936 	if (cft->write)
2937 		return cft->write(of, buf, nbytes, off);
2938 
2939 	/*
2940 	 * kernfs guarantees that a file isn't deleted with operations in
2941 	 * flight, which means that the matching css is and stays alive and
2942 	 * doesn't need to be pinned.  The RCU locking is not necessary
2943 	 * either.  It's just for the convenience of using cgroup_css().
2944 	 */
2945 	rcu_read_lock();
2946 	css = cgroup_css(cgrp, cft->ss);
2947 	rcu_read_unlock();
2948 
2949 	if (cft->write_u64) {
2950 		unsigned long long v;
2951 		ret = kstrtoull(buf, 0, &v);
2952 		if (!ret)
2953 			ret = cft->write_u64(css, cft, v);
2954 	} else if (cft->write_s64) {
2955 		long long v;
2956 		ret = kstrtoll(buf, 0, &v);
2957 		if (!ret)
2958 			ret = cft->write_s64(css, cft, v);
2959 	} else {
2960 		ret = -EINVAL;
2961 	}
2962 
2963 	return ret ?: nbytes;
2964 }
2965 
2966 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2967 {
2968 	return seq_cft(seq)->seq_start(seq, ppos);
2969 }
2970 
2971 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2972 {
2973 	return seq_cft(seq)->seq_next(seq, v, ppos);
2974 }
2975 
2976 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2977 {
2978 	seq_cft(seq)->seq_stop(seq, v);
2979 }
2980 
2981 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2982 {
2983 	struct cftype *cft = seq_cft(m);
2984 	struct cgroup_subsys_state *css = seq_css(m);
2985 
2986 	if (cft->seq_show)
2987 		return cft->seq_show(m, arg);
2988 
2989 	if (cft->read_u64)
2990 		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2991 	else if (cft->read_s64)
2992 		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2993 	else
2994 		return -EINVAL;
2995 	return 0;
2996 }
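
/*
 * Illustrative sketch: a read-only file implemented via ->seq_show, which
 * takes the first branch in cgroup_seqfile_show() above.  The file and
 * function names are made up for the example.
 */
static int example_id_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%d\n", seq_css(m)->id);
	return 0;
}

/* wired up as { .name = "example.id", .seq_show = example_id_show } */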
2997 
2998 static struct kernfs_ops cgroup_kf_single_ops = {
2999 	.atomic_write_len	= PAGE_SIZE,
3000 	.write			= cgroup_file_write,
3001 	.seq_show		= cgroup_seqfile_show,
3002 };
3003 
3004 static struct kernfs_ops cgroup_kf_ops = {
3005 	.atomic_write_len	= PAGE_SIZE,
3006 	.write			= cgroup_file_write,
3007 	.seq_start		= cgroup_seqfile_start,
3008 	.seq_next		= cgroup_seqfile_next,
3009 	.seq_stop		= cgroup_seqfile_stop,
3010 	.seq_show		= cgroup_seqfile_show,
3011 };
3012 
3013 /*
3014  * cgroup_rename - Only allow simple rename of directories in place.
3015  */
3016 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
3017 			 const char *new_name_str)
3018 {
3019 	struct cgroup *cgrp = kn->priv;
3020 	int ret;
3021 
3022 	if (kernfs_type(kn) != KERNFS_DIR)
3023 		return -ENOTDIR;
3024 	if (kn->parent != new_parent)
3025 		return -EIO;
3026 
3027 	/*
3028 	 * This isn't a proper migration and its usefulness is very
3029 	 * limited.  Disallow on the default hierarchy.
3030 	 */
3031 	if (cgroup_on_dfl(cgrp))
3032 		return -EPERM;
3033 
3034 	/*
3035 	 * We're gonna grab cgroup_mutex which nests outside kernfs
3036 	 * active_ref.  kernfs_rename() doesn't require active_ref
3037 	 * protection.  Break them before grabbing cgroup_mutex.
3038 	 */
3039 	kernfs_break_active_protection(new_parent);
3040 	kernfs_break_active_protection(kn);
3041 
3042 	mutex_lock(&cgroup_mutex);
3043 
3044 	ret = kernfs_rename(kn, new_parent, new_name_str);
3045 
3046 	mutex_unlock(&cgroup_mutex);
3047 
3048 	kernfs_unbreak_active_protection(kn);
3049 	kernfs_unbreak_active_protection(new_parent);
3050 	return ret;
3051 }
3052 
3053 /* set uid and gid of cgroup dirs and files to that of the creator */
3054 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3055 {
3056 	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3057 			       .ia_uid = current_fsuid(),
3058 			       .ia_gid = current_fsgid(), };
3059 
3060 	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3061 	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3062 		return 0;
3063 
3064 	return kernfs_setattr(kn, &iattr);
3065 }
3066 
3067 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
3068 {
3069 	char name[CGROUP_FILE_NAME_MAX];
3070 	struct kernfs_node *kn;
3071 	struct lock_class_key *key = NULL;
3072 	int ret;
3073 
3074 #ifdef CONFIG_DEBUG_LOCK_ALLOC
3075 	key = &cft->lockdep_key;
3076 #endif
3077 	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3078 				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
3079 				  NULL, key);
3080 	if (IS_ERR(kn))
3081 		return PTR_ERR(kn);
3082 
3083 	ret = cgroup_kn_set_ugid(kn);
3084 	if (ret) {
3085 		kernfs_remove(kn);
3086 		return ret;
3087 	}
3088 
3089 	if (cft->seq_show == cgroup_populated_show)
3090 		cgrp->populated_kn = kn;
3091 	return 0;
3092 }
3093 
3094 /**
3095  * cgroup_addrm_files - add or remove files to a cgroup directory
3096  * @cgrp: the target cgroup
3097  * @cfts: array of cftypes to be added
3098  * @is_add: whether to add or remove
3099  *
3100  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
3101  * For removals, this function never fails.  If addition fails, this
3102  * function doesn't remove files already added.  The caller is responsible
3103  * for cleaning up.
3104  */
3105 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
3106 			      bool is_add)
3107 {
3108 	struct cftype *cft;
3109 	int ret;
3110 
3111 	lockdep_assert_held(&cgroup_mutex);
3112 
3113 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
3114 		/* does cft->flags tell us to skip this file on @cgrp? */
3115 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3116 			continue;
3117 		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3118 			continue;
3119 		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3120 			continue;
3121 		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3122 			continue;
3123 
3124 		if (is_add) {
3125 			ret = cgroup_add_file(cgrp, cft);
3126 			if (ret) {
3127 				pr_warn("%s: failed to add %s, err=%d\n",
3128 					__func__, cft->name, ret);
3129 				return ret;
3130 			}
3131 		} else {
3132 			cgroup_rm_file(cgrp, cft);
3133 		}
3134 	}
3135 	return 0;
3136 }
3137 
3138 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3139 {
3140 	LIST_HEAD(pending);
3141 	struct cgroup_subsys *ss = cfts[0].ss;
3142 	struct cgroup *root = &ss->root->cgrp;
3143 	struct cgroup_subsys_state *css;
3144 	int ret = 0;
3145 
3146 	lockdep_assert_held(&cgroup_mutex);
3147 
3148 	/* add/rm files for all cgroups created before */
3149 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3150 		struct cgroup *cgrp = css->cgroup;
3151 
3152 		if (cgroup_is_dead(cgrp))
3153 			continue;
3154 
3155 		ret = cgroup_addrm_files(cgrp, cfts, is_add);
3156 		if (ret)
3157 			break;
3158 	}
3159 
3160 	if (is_add && !ret)
3161 		kernfs_activate(root->kn);
3162 	return ret;
3163 }
3164 
3165 static void cgroup_exit_cftypes(struct cftype *cfts)
3166 {
3167 	struct cftype *cft;
3168 
3169 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
3170 		/* free copy for custom atomic_write_len, see init_cftypes() */
3171 		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3172 			kfree(cft->kf_ops);
3173 		cft->kf_ops = NULL;
3174 		cft->ss = NULL;
3175 
3176 		/* revert flags set by cgroup core while adding @cfts */
3177 		cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3178 	}
3179 }
3180 
3181 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3182 {
3183 	struct cftype *cft;
3184 
3185 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
3186 		struct kernfs_ops *kf_ops;
3187 
3188 		WARN_ON(cft->ss || cft->kf_ops);
3189 
3190 		if (cft->seq_start)
3191 			kf_ops = &cgroup_kf_ops;
3192 		else
3193 			kf_ops = &cgroup_kf_single_ops;
3194 
3195 		/*
3196 		 * Ugh... if @cft wants a custom max_write_len, we need to
3197 		 * make a copy of kf_ops to set its atomic_write_len.
3198 		 */
3199 		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3200 			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3201 			if (!kf_ops) {
3202 				cgroup_exit_cftypes(cfts);
3203 				return -ENOMEM;
3204 			}
3205 			kf_ops->atomic_write_len = cft->max_write_len;
3206 		}
3207 
3208 		cft->kf_ops = kf_ops;
3209 		cft->ss = ss;
3210 	}
3211 
3212 	return 0;
3213 }
3214 
3215 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3216 {
3217 	lockdep_assert_held(&cgroup_mutex);
3218 
3219 	if (!cfts || !cfts[0].ss)
3220 		return -ENOENT;
3221 
3222 	list_del(&cfts->node);
3223 	cgroup_apply_cftypes(cfts, false);
3224 	cgroup_exit_cftypes(cfts);
3225 	return 0;
3226 }
3227 
3228 /**
3229  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
3230  * @cfts: zero-length name terminated array of cftypes
3231  *
3232  * Unregister @cfts.  Files described by @cfts are removed from all
3233  * existing cgroups and all future cgroups won't have them either.  This
3234  * function can be called anytime whether @cfts' subsys is attached or not.
3235  *
3236  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
3237  * registered.
3238  */
3239 int cgroup_rm_cftypes(struct cftype *cfts)
3240 {
3241 	int ret;
3242 
3243 	mutex_lock(&cgroup_mutex);
3244 	ret = cgroup_rm_cftypes_locked(cfts);
3245 	mutex_unlock(&cgroup_mutex);
3246 	return ret;
3247 }
3248 
3249 /**
3250  * cgroup_add_cftypes - add an array of cftypes to a subsystem
3251  * @ss: target cgroup subsystem
3252  * @cfts: zero-length name terminated array of cftypes
3253  *
3254  * Register @cfts to @ss.  Files described by @cfts are created for all
3255  * existing cgroups to which @ss is attached and all future cgroups will
3256  * have them too.  This function can be called anytime whether @ss is
3257  * attached or not.
3258  *
3259  * Returns 0 on successful registration, -errno on failure.  Note that this
3260  * function currently returns 0 as long as @cfts registration is successful
3261  * even if some file creation attempts on existing cgroups fail.
3262  */
3263 static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3264 {
3265 	int ret;
3266 
3267 	if (ss->disabled)
3268 		return 0;
3269 
3270 	if (!cfts || cfts[0].name[0] == '\0')
3271 		return 0;
3272 
3273 	ret = cgroup_init_cftypes(ss, cfts);
3274 	if (ret)
3275 		return ret;
3276 
3277 	mutex_lock(&cgroup_mutex);
3278 
3279 	list_add_tail(&cfts->node, &ss->cfts);
3280 	ret = cgroup_apply_cftypes(cfts, true);
3281 	if (ret)
3282 		cgroup_rm_cftypes_locked(cfts);
3283 
3284 	mutex_unlock(&cgroup_mutex);
3285 	return ret;
3286 }
3287 
3288 /**
3289  * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
3290  * @ss: target cgroup subsystem
3291  * @cfts: zero-length name terminated array of cftypes
3292  *
3293  * Similar to cgroup_add_cftypes() but the added files are only used for
3294  * the default hierarchy.
3295  */
3296 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3297 {
3298 	struct cftype *cft;
3299 
3300 	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3301 		cft->flags |= __CFTYPE_ONLY_ON_DFL;
3302 	return cgroup_add_cftypes(ss, cfts);
3303 }
3304 
3305 /**
3306  * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
3307  * @ss: target cgroup subsystem
3308  * @cfts: zero-length name terminated array of cftypes
3309  *
3310  * Similar to cgroup_add_cftypes() but the added files are only used for
3311  * the legacy hierarchies.
3312  */
3313 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3314 {
3315 	struct cftype *cft;
3316 
3317 	/*
3318 	 * If cgroup_legacy_files_on_dfl, we want to show the legacy files on the
3319 	 * dfl hierarchy but only if the target subsystem hasn't been updated
3320 	 * for the dfl hierarchy yet.
3321 	 */
3322 	if (!cgroup_legacy_files_on_dfl ||
3323 	    ss->dfl_cftypes != ss->legacy_cftypes) {
3324 		for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3325 			cft->flags |= __CFTYPE_NOT_ON_DFL;
3326 	}
3327 
3328 	return cgroup_add_cftypes(ss, cfts);
3329 }
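
/*
 * Illustrative sketch: how a controller typically describes its files and
 * registers them with the helpers above.  All names (example_*) are made
 * up; a real controller would pass its own cgroup_subsys.
 */
static u64 example_weight_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return 100;	/* placeholder value */
}

static int example_weight_write(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 val)
{
	return val && val <= 1000 ? 0 : -EINVAL;
}

static struct cftype example_files[] = {
	{
		.name = "weight",
		.read_u64 = example_weight_read,
		.write_u64 = example_weight_write,
	},
	{ }	/* terminate */
};

/* e.g. from the controller's init code:
 *	cgroup_add_legacy_cftypes(&example_cgrp_subsys, example_files);
 */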
3330 
3331 /**
3332  * cgroup_task_count - count the number of tasks in a cgroup.
3333  * @cgrp: the cgroup in question
3334  *
3335  * Return the number of tasks in the cgroup.
3336  */
3337 static int cgroup_task_count(const struct cgroup *cgrp)
3338 {
3339 	int count = 0;
3340 	struct cgrp_cset_link *link;
3341 
3342 	down_read(&css_set_rwsem);
3343 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
3344 		count += atomic_read(&link->cset->refcount);
3345 	up_read(&css_set_rwsem);
3346 	return count;
3347 }
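
/*
 * Illustrative sketch: a legacy read_u64 handler could expose this count,
 * e.g. as { .name = "task_count", .read_u64 = example_task_count_read }.
 * The names are made up for the example.
 */
static u64 example_task_count_read(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return cgroup_task_count(css->cgroup);
}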
3348 
3349 /**
3350  * css_next_child - find the next child of a given css
3351  * @pos: the current position (%NULL to initiate traversal)
3352  * @parent: css whose children to walk
3353  *
3354  * This function returns the next child of @parent and should be called
3355  * under either cgroup_mutex or RCU read lock.  The only requirement is
3356  * that @parent and @pos are accessible.  The next sibling is guaranteed to
3357  * be returned regardless of their states.
3358  *
3359  * If a subsystem synchronizes ->css_online() and the start of iteration, a
3360  * css which finished ->css_online() is guaranteed to be visible in the
3361  * future iterations and will stay visible until the last reference is put.
3362  * A css which hasn't finished ->css_online() or already finished
3363  * ->css_offline() may show up during traversal.  It's each subsystem's
3364  * responsibility to synchronize against on/offlining.
3365  */
3366 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
3367 					   struct cgroup_subsys_state *parent)
3368 {
3369 	struct cgroup_subsys_state *next;
3370 
3371 	cgroup_assert_mutex_or_rcu_locked();
3372 
3373 	/*
3374 	 * @pos could already have been unlinked from the sibling list.
3375 	 * Once a cgroup is removed, its ->sibling.next is no longer
3376 	 * updated when its next sibling changes.  CSS_RELEASED is set when
3377 	 * @pos is taken off list, at which time its next pointer is valid,
3378 	 * and, as releases are serialized, the one pointed to by the next
3379 	 * pointer is guaranteed to not have started release yet.  This
3380 	 * implies that if we observe !CSS_RELEASED on @pos in this RCU
3381 	 * critical section, the one pointed to by its next pointer is
3382 	 * guaranteed to not have finished its RCU grace period even if we
3383 	 * have dropped rcu_read_lock() in between iterations.
3384 	 *
3385 	 * If @pos has CSS_RELEASED set, its next pointer can't be
3386 	 * dereferenced; however, as each css is given a monotonically
3387 	 * increasing unique serial number and always appended to the
3388 	 * sibling list, the next one can be found by walking the parent's
3389 	 * children until the first css with higher serial number than
3390 	 * @pos's.  While this path can be slower, it happens iff iteration
3391 	 * races against release and the race window is very small.
3392 	 */
3393 	if (!pos) {
3394 		next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
3395 	} else if (likely(!(pos->flags & CSS_RELEASED))) {
3396 		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
3397 	} else {
3398 		list_for_each_entry_rcu(next, &parent->children, sibling)
3399 			if (next->serial_nr > pos->serial_nr)
3400 				break;
3401 	}
3402 
3403 	/*
3404 	 * @next, if not pointing to the head, can be dereferenced and is
3405 	 * the next sibling.
3406 	 */
3407 	if (&next->sibling != &parent->children)
3408 		return next;
3409 	return NULL;
3410 }
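/*
 * A minimal usage sketch (illustrative, not part of this file): a
 * subsystem walking the direct children of a css under RCU via the
 * css_for_each_child() wrapper, which is built on css_next_child().
 * The helper name my_count_children() is hypothetical.
 *
 *	static int my_count_children(struct cgroup_subsys_state *parent)
 *	{
 *		struct cgroup_subsys_state *child;
 *		int n = 0;
 *
 *		rcu_read_lock();
 *		css_for_each_child(child, parent)
 *			n++;
 *		rcu_read_unlock();
 *		return n;
 *	}
 */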
3411 
3412 /**
3413  * css_next_descendant_pre - find the next descendant for pre-order walk
3414  * @pos: the current position (%NULL to initiate traversal)
3415  * @root: css whose descendants to walk
3416  *
3417  * To be used by css_for_each_descendant_pre().  Find the next descendant
3418  * to visit for pre-order traversal of @root's descendants.  @root is
3419  * included in the iteration and the first node to be visited.
3420  *
3421  * While this function requires cgroup_mutex or RCU read locking, it
3422  * doesn't require the whole traversal to be contained in a single critical
3423  * section.  This function will return the correct next descendant as long
3424  * as both @pos and @root are accessible and @pos is a descendant of @root.
3425  *
3426  * If a subsystem synchronizes ->css_online() and the start of iteration, a
3427  * css which finished ->css_online() is guaranteed to be visible in the
3428  * future iterations and will stay visible until the last reference is put.
3429  * A css which hasn't finished ->css_online() or already finished
3430  * ->css_offline() may show up during traversal.  It's each subsystem's
3431  * responsibility to synchronize against on/offlining.
3432  */
3433 struct cgroup_subsys_state *
3434 css_next_descendant_pre(struct cgroup_subsys_state *pos,
3435 			struct cgroup_subsys_state *root)
3436 {
3437 	struct cgroup_subsys_state *next;
3438 
3439 	cgroup_assert_mutex_or_rcu_locked();
3440 
3441 	/* if first iteration, visit @root */
3442 	if (!pos)
3443 		return root;
3444 
3445 	/* visit the first child if exists */
3446 	next = css_next_child(NULL, pos);
3447 	if (next)
3448 		return next;
3449 
3450 	/* no child, visit my or the closest ancestor's next sibling */
3451 	while (pos != root) {
3452 		next = css_next_child(pos, pos->parent);
3453 		if (next)
3454 			return next;
3455 		pos = pos->parent;
3456 	}
3457 
3458 	return NULL;
3459 }
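/*
 * A minimal usage sketch (illustrative, not part of this file): a
 * pre-order walk over a subtree via css_for_each_descendant_pre(),
 * which is built on this function.  @root is visited first, then every
 * descendant, parents before their children.  do_something() is a
 * hypothetical per-css callback.
 *
 *	rcu_read_lock();
 *	css_for_each_descendant_pre(pos, root)
 *		do_something(pos);
 *	rcu_read_unlock();
 */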
3460 
3461 /**
3462  * css_rightmost_descendant - return the rightmost descendant of a css
3463  * @pos: css of interest
3464  *
3465  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
3466  * is returned.  This can be used during pre-order traversal to skip
3467  * subtree of @pos.
3468  *
3469  * While this function requires cgroup_mutex or RCU read locking, it
3470  * doesn't require the whole traversal to be contained in a single critical
3471  * section.  This function will return the correct rightmost descendant as
3472  * long as @pos is accessible.
3473  */
3474 struct cgroup_subsys_state *
3475 css_rightmost_descendant(struct cgroup_subsys_state *pos)
3476 {
3477 	struct cgroup_subsys_state *last, *tmp;
3478 
3479 	cgroup_assert_mutex_or_rcu_locked();
3480 
3481 	do {
3482 		last = pos;
3483 		/* ->prev isn't RCU safe, walk ->next till the end */
3484 		pos = NULL;
3485 		css_for_each_child(tmp, last)
3486 			pos = tmp;
3487 	} while (pos);
3488 
3489 	return last;
3490 }
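/*
 * A minimal usage sketch (illustrative, not part of this file): skipping
 * an entire subtree during a pre-order walk by jumping @pos to its
 * rightmost descendant so that the next iteration continues with the
 * following sibling.  should_skip() and process() are hypothetical.
 *
 *	css_for_each_descendant_pre(pos, root) {
 *		if (should_skip(pos))
 *			pos = css_rightmost_descendant(pos);
 *		else
 *			process(pos);
 *	}
 */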
3491 
3492 static struct cgroup_subsys_state *
3493 css_leftmost_descendant(struct cgroup_subsys_state *pos)
3494 {
3495 	struct cgroup_subsys_state *last;
3496 
3497 	do {
3498 		last = pos;
3499 		pos = css_next_child(NULL, pos);
3500 	} while (pos);
3501 
3502 	return last;
3503 }
3504 
3505 /**
3506  * css_next_descendant_post - find the next descendant for post-order walk
3507  * @pos: the current position (%NULL to initiate traversal)
3508  * @root: css whose descendants to walk
3509  *
3510  * To be used by css_for_each_descendant_post().  Find the next descendant
3511  * to visit for post-order traversal of @root's descendants.  @root is
3512  * included in the iteration and the last node to be visited.
3513  *
3514  * While this function requires cgroup_mutex or RCU read locking, it
3515  * doesn't require the whole traversal to be contained in a single critical
3516  * section.  This function will return the correct next descendant as long
3517  * as both @pos and @root are accessible and @pos is a descendant of
3518  * @root.
3519  *
3520  * If a subsystem synchronizes ->css_online() and the start of iteration, a
3521  * css which finished ->css_online() is guaranteed to be visible in the
3522  * future iterations and will stay visible until the last reference is put.
3523  * A css which hasn't finished ->css_online() or already finished
3524  * ->css_offline() may show up during traversal.  It's each subsystem's
3525  * responsibility to synchronize against on/offlining.
3526  */
3527 struct cgroup_subsys_state *
3528 css_next_descendant_post(struct cgroup_subsys_state *pos,
3529 			 struct cgroup_subsys_state *root)
3530 {
3531 	struct cgroup_subsys_state *next;
3532 
3533 	cgroup_assert_mutex_or_rcu_locked();
3534 
3535 	/* if first iteration, visit leftmost descendant which may be @root */
3536 	if (!pos)
3537 		return css_leftmost_descendant(root);
3538 
3539 	/* if we visited @root, we're done */
3540 	if (pos == root)
3541 		return NULL;
3542 
3543 	/* if there's an unvisited sibling, visit its leftmost descendant */
3544 	next = css_next_child(pos, pos->parent);
3545 	if (next)
3546 		return css_leftmost_descendant(next);
3547 
3548 	/* no sibling left, visit parent */
3549 	return pos->parent;
3550 }
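/*
 * A minimal usage sketch (illustrative, not part of this file): a
 * post-order walk via css_for_each_descendant_post() visits children
 * before their parents and @root last - the usual shape for bottom-up
 * aggregation or teardown.  accumulate() is a hypothetical callback.
 *
 *	rcu_read_lock();
 *	css_for_each_descendant_post(pos, root)
 *		accumulate(pos);
 *	rcu_read_unlock();
 */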
3551 
3552 /**
3553  * css_has_online_children - does a css have online children
3554  * @css: the target css
3555  *
3556  * Returns %true if @css has any online children; otherwise, %false.  This
3557  * function can be called from any context but the caller is responsible
3558  * for synchronizing against on/offlining as necessary.
3559  */
3560 bool css_has_online_children(struct cgroup_subsys_state *css)
3561 {
3562 	struct cgroup_subsys_state *child;
3563 	bool ret = false;
3564 
3565 	rcu_read_lock();
3566 	css_for_each_child(child, css) {
3567 		if (child->flags & CSS_ONLINE) {
3568 			ret = true;
3569 			break;
3570 		}
3571 	}
3572 	rcu_read_unlock();
3573 	return ret;
3574 }
3575 
3576 /**
3577  * css_advance_task_iter - advance a task iterator to the next css_set
3578  * @it: the iterator to advance
3579  *
3580  * Advance @it to the next css_set to walk.
3581  */
3582 static void css_advance_task_iter(struct css_task_iter *it)
3583 {
3584 	struct list_head *l = it->cset_pos;
3585 	struct cgrp_cset_link *link;
3586 	struct css_set *cset;
3587 
3588 	/* Advance to the next non-empty css_set */
3589 	do {
3590 		l = l->next;
3591 		if (l == it->cset_head) {
3592 			it->cset_pos = NULL;
3593 			return;
3594 		}
3595 
3596 		if (it->ss) {
3597 			cset = container_of(l, struct css_set,
3598 					    e_cset_node[it->ss->id]);
3599 		} else {
3600 			link = list_entry(l, struct cgrp_cset_link, cset_link);
3601 			cset = link->cset;
3602 		}
3603 	} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
3604 
3605 	it->cset_pos = l;
3606 
3607 	if (!list_empty(&cset->tasks))
3608 		it->task_pos = cset->tasks.next;
3609 	else
3610 		it->task_pos = cset->mg_tasks.next;
3611 
3612 	it->tasks_head = &cset->tasks;
3613 	it->mg_tasks_head = &cset->mg_tasks;
3614 }
3615 
3616 /**
3617  * css_task_iter_start - initiate task iteration
3618  * @css: the css to walk tasks of
3619  * @it: the task iterator to use
3620  *
3621  * Initiate iteration through the tasks of @css.  The caller can call
3622  * css_task_iter_next() to walk through the tasks until the function
3623  * returns NULL.  On completion of iteration, css_task_iter_end() must be
3624  * called.
3625  *
3626  * Note that this function acquires a lock which is released when the
3627  * iteration finishes.  The caller can't sleep while iteration is in
3628  * progress.
3629  */
3630 void css_task_iter_start(struct cgroup_subsys_state *css,
3631 			 struct css_task_iter *it)
3632 	__acquires(css_set_rwsem)
3633 {
3634 	/* no one should try to iterate before mounting cgroups */
3635 	WARN_ON_ONCE(!use_task_css_set_links);
3636 
3637 	down_read(&css_set_rwsem);
3638 
3639 	it->ss = css->ss;
3640 
3641 	if (it->ss)
3642 		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3643 	else
3644 		it->cset_pos = &css->cgroup->cset_links;
3645 
3646 	it->cset_head = it->cset_pos;
3647 
3648 	css_advance_task_iter(it);
3649 }
3650 
3651 /**
3652  * css_task_iter_next - return the next task for the iterator
3653  * @it: the task iterator being iterated
3654  *
3655  * The "next" function for task iteration.  @it should have been
3656  * initialized via css_task_iter_start().  Returns NULL when the iteration
3657  * reaches the end.
3658  */
3659 struct task_struct *css_task_iter_next(struct css_task_iter *it)
3660 {
3661 	struct task_struct *res;
3662 	struct list_head *l = it->task_pos;
3663 
3664 	/* If the iterator's cset position is NULL, we have no tasks */
3665 	if (!it->cset_pos)
3666 		return NULL;
3667 	res = list_entry(l, struct task_struct, cg_list);
3668 
3669 	/*
3670 	 * Advance iterator to find next entry.  cset->tasks is consumed
3671 	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
3672 	 * next cset.
3673 	 */
3674 	l = l->next;
3675 
3676 	if (l == it->tasks_head)
3677 		l = it->mg_tasks_head->next;
3678 
3679 	if (l == it->mg_tasks_head)
3680 		css_advance_task_iter(it);
3681 	else
3682 		it->task_pos = l;
3683 
3684 	return res;
3685 }
3686 
3687 /**
3688  * css_task_iter_end - finish task iteration
3689  * @it: the task iterator to finish
3690  *
3691  * Finish task iteration started by css_task_iter_start().
3692  */
3693 void css_task_iter_end(struct css_task_iter *it)
3694 	__releases(css_set_rwsem)
3695 {
3696 	up_read(&css_set_rwsem);
3697 }
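/*
 * A minimal usage sketch (illustrative, not part of this file) of the
 * start/next/end pattern for walking every task attached to a css, as
 * used by cgroupstats_build() and pidlist_array_load() below.  Because
 * css_set_rwsem is read-held between _start() and _end(), the loop body
 * must not sleep.  visit() is a hypothetical helper.
 *
 *	struct css_task_iter it;
 *	struct task_struct *task;
 *
 *	css_task_iter_start(css, &it);
 *	while ((task = css_task_iter_next(&it)))
 *		visit(task);
 *	css_task_iter_end(&it);
 */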
3698 
3699 /**
3700  * cgroup_transfer_tasks - move tasks from one cgroup to another
3701  * @to: cgroup to which the tasks will be moved
3702  * @from: cgroup in which the tasks currently reside
3703  *
3704  * Locking rules between cgroup_post_fork() and the migration path
3705  * guarantee that, if a task is forking while being migrated, the new child
3706  * is guaranteed to be either visible in the source cgroup after the
3707  * parent's migration is complete or put into the target cgroup.  No task
3708  * can slip out of migration through forking.
3709  */
3710 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3711 {
3712 	LIST_HEAD(preloaded_csets);
3713 	struct cgrp_cset_link *link;
3714 	struct css_task_iter it;
3715 	struct task_struct *task;
3716 	int ret;
3717 
3718 	mutex_lock(&cgroup_mutex);
3719 
3720 	/* all tasks in @from are being moved, all csets are source */
3721 	down_read(&css_set_rwsem);
3722 	list_for_each_entry(link, &from->cset_links, cset_link)
3723 		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3724 	up_read(&css_set_rwsem);
3725 
3726 	ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3727 	if (ret)
3728 		goto out_err;
3729 
3730 	/*
3731 	 * Migrate tasks one-by-one until @from is empty.  This fails iff
3732 	 * ->can_attach() fails.
3733 	 */
3734 	do {
3735 		css_task_iter_start(&from->self, &it);
3736 		task = css_task_iter_next(&it);
3737 		if (task)
3738 			get_task_struct(task);
3739 		css_task_iter_end(&it);
3740 
3741 		if (task) {
3742 			ret = cgroup_migrate(to, task, false);
3743 			put_task_struct(task);
3744 		}
3745 	} while (task && !ret);
3746 out_err:
3747 	cgroup_migrate_finish(&preloaded_csets);
3748 	mutex_unlock(&cgroup_mutex);
3749 	return ret;
3750 }
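/*
 * A minimal usage sketch (illustrative, not part of this file): draining
 * every task from a cgroup into its parent, the kind of operation a
 * controller might perform when a group loses all of its resources.
 *
 *	ret = cgroup_transfer_tasks(cgroup_parent(cgrp), cgrp);
 *	if (ret)
 *		pr_warn("failed to drain tasks from %s (%d)\n",
 *			cgrp->kn->name, ret);
 */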
3751 
3752 /*
3753  * Stuff for reading the 'tasks'/'procs' files.
3754  *
3755  * Reading this file can return large amounts of data if a cgroup has
3756  * *lots* of attached tasks. So it may need several calls to read(),
3757  * but we cannot guarantee that the information we produce is correct
3758  * unless we produce it entirely atomically.
3759  *
3760  */
3761 
3762 /* which pidlist file are we talking about? */
3763 enum cgroup_filetype {
3764 	CGROUP_FILE_PROCS,
3765 	CGROUP_FILE_TASKS,
3766 };
3767 
3768 /*
3769  * A pidlist is a list of pids that virtually represents the contents of one
3770  * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3771  * a pair (one each for procs, tasks) for each pid namespace that's relevant
3772  * to the cgroup.
3773  */
3774 struct cgroup_pidlist {
3775 	/*
3776 	 * used to find which pidlist is wanted. doesn't change as long as
3777 	 * this particular list stays in the list.
3778 	 */
3779 	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3780 	/* array of xids */
3781 	pid_t *list;
3782 	/* how many elements the above list has */
3783 	int length;
3784 	/* each of these stored in a list by its cgroup */
3785 	struct list_head links;
3786 	/* pointer to the cgroup we belong to, for list removal purposes */
3787 	struct cgroup *owner;
3788 	/* for delayed destruction */
3789 	struct delayed_work destroy_dwork;
3790 };
3791 
3792 /*
3793  * The following two functions "fix" the issue where there are more pids
3794  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3795  * TODO: replace with a kernel-wide solution to this problem
3796  */
3797 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3798 static void *pidlist_allocate(int count)
3799 {
3800 	if (PIDLIST_TOO_LARGE(count))
3801 		return vmalloc(count * sizeof(pid_t));
3802 	else
3803 		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3804 }
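/*
 * For example, with 4K pages and a 4-byte pid_t, PIDLIST_TOO_LARGE()
 * triggers once the list needs more than 2048 entries (2048 * 4 bytes =
 * two pages), so modestly sized cgroups keep using kmalloc() while very
 * populated ones fall back to vmalloc().
 */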
3805 
3806 static void pidlist_free(void *p)
3807 {
3808 	kvfree(p);
3809 }
3810 
3811 /*
3812  * Used to destroy all pidlists that linger waiting for the destroy timer.
3813  * should be left afterwards.
3814  */
3815 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3816 {
3817 	struct cgroup_pidlist *l, *tmp_l;
3818 
3819 	mutex_lock(&cgrp->pidlist_mutex);
3820 	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3821 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3822 	mutex_unlock(&cgrp->pidlist_mutex);
3823 
3824 	flush_workqueue(cgroup_pidlist_destroy_wq);
3825 	BUG_ON(!list_empty(&cgrp->pidlists));
3826 }
3827 
3828 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3829 {
3830 	struct delayed_work *dwork = to_delayed_work(work);
3831 	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3832 						destroy_dwork);
3833 	struct cgroup_pidlist *tofree = NULL;
3834 
3835 	mutex_lock(&l->owner->pidlist_mutex);
3836 
3837 	/*
3838 	 * Destroy iff we didn't get queued again.  The state won't change
3839 	 * as destroy_dwork can only be queued while locked.
3840 	 */
3841 	if (!delayed_work_pending(dwork)) {
3842 		list_del(&l->links);
3843 		pidlist_free(l->list);
3844 		put_pid_ns(l->key.ns);
3845 		tofree = l;
3846 	}
3847 
3848 	mutex_unlock(&l->owner->pidlist_mutex);
3849 	kfree(tofree);
3850 }
3851 
3852 /*
3853  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3854  * Returns the number of unique elements.
3855  */
3856 static int pidlist_uniq(pid_t *list, int length)
3857 {
3858 	int src, dest = 1;
3859 
3860 	/*
3861 	 * we presume the 0th element is unique, so src starts at 1. trivial
3862 	 * edge cases first; no work needs to be done for either
3863 	 */
3864 	if (length == 0 || length == 1)
3865 		return length;
3866 	/* src and dest walk down the list; dest counts unique elements */
3867 	for (src = 1; src < length; src++) {
3868 		/* find next unique element */
3869 		while (list[src] == list[src-1]) {
3870 			src++;
3871 			if (src == length)
3872 				goto after;
3873 		}
3874 		/* dest always points to where the next unique element goes */
3875 		list[dest] = list[src];
3876 		dest++;
3877 	}
3878 after:
3879 	return dest;
3880 }
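/*
 * For example, the sorted input { 3, 3, 5, 7, 7, 7 } is compacted in
 * place so that its first three entries become { 3, 5, 7 } and
 * pidlist_uniq() returns 3; entries past the returned length are left
 * stale and simply ignored by the caller.
 */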
3881 
3882 /*
3883  * The two pid files - tasks and cgroup.procs - guarantee that the result
3884  * is sorted, which forced this whole pidlist fiasco.  As pid order is
3885  * different per namespace, each namespace needs a differently sorted list,
3886  * making it impossible to use, for example, a single rbtree of member tasks
3887  * sorted by task pointer.  As pidlists can be fairly large, allocating one
3888  * per open file is dangerous, so cgroup had to implement a shared pool of
3889  * pidlists keyed by cgroup and namespace.
3890  *
3891  * All this extra complexity was caused by the original implementation
3892  * committing to an entirely unnecessary property.  In the long term, we
3893  * want to do away with it.  Explicitly scramble sort order if on the
3894  * default hierarchy so that no such expectation exists in the new
3895  * interface.
3896  *
3897  * Scrambling is done by swapping every two consecutive bits, which is a
3898  * non-identity one-to-one mapping that disturbs sort order sufficiently.
3899  */
3900 static pid_t pid_fry(pid_t pid)
3901 {
3902 	unsigned a = pid & 0x55555555;
3903 	unsigned b = pid & 0xAAAAAAAA;
3904 
3905 	return (a << 1) | (b >> 1);
3906 }
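/*
 * For example, pid 6 (binary 0110) maps to 9 (binary 1001): the even
 * bits 0100 move left to 1000 while the odd bits 0010 move right to
 * 0001.  The mapping is one-to-one (applying it twice returns the
 * original pid), and on the default hierarchy the pidlist is sorted by
 * the fried value, so the binary search in cgroup_pidlist_start() can
 * compare fried values directly.
 */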
3907 
3908 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3909 {
3910 	if (cgroup_on_dfl(cgrp))
3911 		return pid_fry(pid);
3912 	else
3913 		return pid;
3914 }
3915 
3916 static int cmppid(const void *a, const void *b)
3917 {
3918 	return *(pid_t *)a - *(pid_t *)b;
3919 }
3920 
3921 static int fried_cmppid(const void *a, const void *b)
3922 {
3923 	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3924 }
3925 
3926 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3927 						  enum cgroup_filetype type)
3928 {
3929 	struct cgroup_pidlist *l;
3930 	/* don't need task_nsproxy() if we're looking at ourself */
3931 	struct pid_namespace *ns = task_active_pid_ns(current);
3932 
3933 	lockdep_assert_held(&cgrp->pidlist_mutex);
3934 
3935 	list_for_each_entry(l, &cgrp->pidlists, links)
3936 		if (l->key.type == type && l->key.ns == ns)
3937 			return l;
3938 	return NULL;
3939 }
3940 
3941 /*
3942  * Find the appropriate pidlist for our purpose (given procs vs tasks).
3943  * Must be called with cgrp->pidlist_mutex held.  Returns the matching
3944  * pidlist, creating a new one if necessary, or NULL if we're out of
3945  * memory.
3946  */
3947 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3948 						enum cgroup_filetype type)
3949 {
3950 	struct cgroup_pidlist *l;
3951 
3952 	lockdep_assert_held(&cgrp->pidlist_mutex);
3953 
3954 	l = cgroup_pidlist_find(cgrp, type);
3955 	if (l)
3956 		return l;
3957 
3958 	/* entry not found; create a new one */
3959 	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3960 	if (!l)
3961 		return l;
3962 
3963 	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3964 	l->key.type = type;
3965 	/* don't need task_nsproxy() if we're looking at ourself */
3966 	l->key.ns = get_pid_ns(task_active_pid_ns(current));
3967 	l->owner = cgrp;
3968 	list_add(&l->links, &cgrp->pidlists);
3969 	return l;
3970 }
3971 
3972 /*
3973  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3974  */
3975 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3976 			      struct cgroup_pidlist **lp)
3977 {
3978 	pid_t *array;
3979 	int length;
3980 	int pid, n = 0; /* used for populating the array */
3981 	struct css_task_iter it;
3982 	struct task_struct *tsk;
3983 	struct cgroup_pidlist *l;
3984 
3985 	lockdep_assert_held(&cgrp->pidlist_mutex);
3986 
3987 	/*
3988 	 * If the cgroup gets more users after we read the count, we won't have
3989 	 * enough space - tough.  This race is indistinguishable to the
3990 	 * caller from the case that the additional cgroup users didn't
3991 	 * show up until sometime later on.
3992 	 */
3993 	length = cgroup_task_count(cgrp);
3994 	array = pidlist_allocate(length);
3995 	if (!array)
3996 		return -ENOMEM;
3997 	/* now, populate the array */
3998 	css_task_iter_start(&cgrp->self, &it);
3999 	while ((tsk = css_task_iter_next(&it))) {
4000 		if (unlikely(n == length))
4001 			break;
4002 		/* get tgid or pid for procs or tasks file respectively */
4003 		if (type == CGROUP_FILE_PROCS)
4004 			pid = task_tgid_vnr(tsk);
4005 		else
4006 			pid = task_pid_vnr(tsk);
4007 		if (pid > 0) /* make sure to only use valid results */
4008 			array[n++] = pid;
4009 	}
4010 	css_task_iter_end(&it);
4011 	length = n;
4012 	/* now sort & (if procs) strip out duplicates */
4013 	if (cgroup_on_dfl(cgrp))
4014 		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
4015 	else
4016 		sort(array, length, sizeof(pid_t), cmppid, NULL);
4017 	if (type == CGROUP_FILE_PROCS)
4018 		length = pidlist_uniq(array, length);
4019 
4020 	l = cgroup_pidlist_find_create(cgrp, type);
4021 	if (!l) {
4022 		pidlist_free(array);
4023 		return -ENOMEM;
4024 	}
4025 
4026 	/* store array, freeing old if necessary */
4027 	pidlist_free(l->list);
4028 	l->list = array;
4029 	l->length = length;
4030 	*lp = l;
4031 	return 0;
4032 }
4033 
4034 /**
4035  * cgroupstats_build - build and fill cgroupstats
4036  * @stats: cgroupstats to fill information into
4037  * @dentry: A dentry entry belonging to the cgroup for which stats have
4038  * been requested.
4039  *
4040  * Build and fill cgroupstats so that taskstats can export it to user
4041  * space.
4042  */
4043 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
4044 {
4045 	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4046 	struct cgroup *cgrp;
4047 	struct css_task_iter it;
4048 	struct task_struct *tsk;
4049 
4050 	/* it should be a kernfs_node belonging to cgroupfs and be a directory */
4051 	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
4052 	    kernfs_type(kn) != KERNFS_DIR)
4053 		return -EINVAL;
4054 
4055 	mutex_lock(&cgroup_mutex);
4056 
4057 	/*
4058 	 * We aren't being called from kernfs and there's no guarantee on
4059 	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
4060 	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
4061 	 */
4062 	rcu_read_lock();
4063 	cgrp = rcu_dereference(kn->priv);
4064 	if (!cgrp || cgroup_is_dead(cgrp)) {
4065 		rcu_read_unlock();
4066 		mutex_unlock(&cgroup_mutex);
4067 		return -ENOENT;
4068 	}
4069 	rcu_read_unlock();
4070 
4071 	css_task_iter_start(&cgrp->self, &it);
4072 	while ((tsk = css_task_iter_next(&it))) {
4073 		switch (tsk->state) {
4074 		case TASK_RUNNING:
4075 			stats->nr_running++;
4076 			break;
4077 		case TASK_INTERRUPTIBLE:
4078 			stats->nr_sleeping++;
4079 			break;
4080 		case TASK_UNINTERRUPTIBLE:
4081 			stats->nr_uninterruptible++;
4082 			break;
4083 		case TASK_STOPPED:
4084 			stats->nr_stopped++;
4085 			break;
4086 		default:
4087 			if (delayacct_is_task_waiting_on_io(tsk))
4088 				stats->nr_io_wait++;
4089 			break;
4090 		}
4091 	}
4092 	css_task_iter_end(&it);
4093 
4094 	mutex_unlock(&cgroup_mutex);
4095 	return 0;
4096 }
4097 
4098 
4099 /*
4100  * seq_file methods for the tasks/procs files. The seq_file position is the
4101  * next pid to display; the seq_file iterator is a pointer to the pid
4102  * in the pidlist's ->list array.
4103  */
4104 
4105 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
4106 {
4107 	/*
4108 	 * Initially we receive a position value that corresponds to
4109 	 * one more than the last pid shown (or 0 on the first call or
4110 	 * after a seek to the start). Use a binary-search to find the
4111 	 * next pid to display, if any
4112 	 */
4113 	struct kernfs_open_file *of = s->private;
4114 	struct cgroup *cgrp = seq_css(s)->cgroup;
4115 	struct cgroup_pidlist *l;
4116 	enum cgroup_filetype type = seq_cft(s)->private;
4117 	int index = 0, pid = *pos;
4118 	int *iter, ret;
4119 
4120 	mutex_lock(&cgrp->pidlist_mutex);
4121 
4122 	/*
4123 	 * !NULL @of->priv indicates that this isn't the first start()
4124 	 * after open.  If the matching pidlist is around, we can use that.
4125 	 * Look for it.  Note that @of->priv can't be used directly.  It
4126 	 * could already have been destroyed.
4127 	 */
4128 	if (of->priv)
4129 		of->priv = cgroup_pidlist_find(cgrp, type);
4130 
4131 	/*
4132 	 * Either this is the first start() after open or the matching
4133 	 * pidlist has been destroyed in between.  Create a new one.
4134 	 */
4135 	if (!of->priv) {
4136 		ret = pidlist_array_load(cgrp, type,
4137 					 (struct cgroup_pidlist **)&of->priv);
4138 		if (ret)
4139 			return ERR_PTR(ret);
4140 	}
4141 	l = of->priv;
4142 
4143 	if (pid) {
4144 		int end = l->length;
4145 
4146 		while (index < end) {
4147 			int mid = (index + end) / 2;
4148 			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
4149 				index = mid;
4150 				break;
4151 			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
4152 				index = mid + 1;
4153 			else
4154 				end = mid;
4155 		}
4156 	}
4157 	/* If we're off the end of the array, we're done */
4158 	if (index >= l->length)
4159 		return NULL;
4160 	/* Update the abstract position to be the actual pid that we found */
4161 	iter = l->list + index;
4162 	*pos = cgroup_pid_fry(cgrp, *iter);
4163 	return iter;
4164 }
4165 
4166 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
4167 {
4168 	struct kernfs_open_file *of = s->private;
4169 	struct cgroup_pidlist *l = of->priv;
4170 
4171 	if (l)
4172 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
4173 				 CGROUP_PIDLIST_DESTROY_DELAY);
4174 	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
4175 }
4176 
4177 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
4178 {
4179 	struct kernfs_open_file *of = s->private;
4180 	struct cgroup_pidlist *l = of->priv;
4181 	pid_t *p = v;
4182 	pid_t *end = l->list + l->length;
4183 	/*
4184 	 * Advance to the next pid in the array. If this goes off the
4185 	 * end, we're done
4186 	 */
4187 	p++;
4188 	if (p >= end) {
4189 		return NULL;
4190 	} else {
4191 		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
4192 		return p;
4193 	}
4194 }
4195 
4196 static int cgroup_pidlist_show(struct seq_file *s, void *v)
4197 {
4198 	seq_printf(s, "%d\n", *(int *)v);
4199 
4200 	return 0;
4201 }
4202 
4203 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
4204 					 struct cftype *cft)
4205 {
4206 	return notify_on_release(css->cgroup);
4207 }
4208 
4209 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
4210 					  struct cftype *cft, u64 val)
4211 {
4212 	if (val)
4213 		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4214 	else
4215 		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4216 	return 0;
4217 }
4218 
4219 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4220 				      struct cftype *cft)
4221 {
4222 	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4223 }
4224 
4225 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4226 				       struct cftype *cft, u64 val)
4227 {
4228 	if (val)
4229 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4230 	else
4231 		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4232 	return 0;
4233 }
4234 
4235 /* cgroup core interface files for the default hierarchy */
4236 static struct cftype cgroup_dfl_base_files[] = {
4237 	{
4238 		.name = "cgroup.procs",
4239 		.seq_start = cgroup_pidlist_start,
4240 		.seq_next = cgroup_pidlist_next,
4241 		.seq_stop = cgroup_pidlist_stop,
4242 		.seq_show = cgroup_pidlist_show,
4243 		.private = CGROUP_FILE_PROCS,
4244 		.write = cgroup_procs_write,
4245 		.mode = S_IRUGO | S_IWUSR,
4246 	},
4247 	{
4248 		.name = "cgroup.controllers",
4249 		.flags = CFTYPE_ONLY_ON_ROOT,
4250 		.seq_show = cgroup_root_controllers_show,
4251 	},
4252 	{
4253 		.name = "cgroup.controllers",
4254 		.flags = CFTYPE_NOT_ON_ROOT,
4255 		.seq_show = cgroup_controllers_show,
4256 	},
4257 	{
4258 		.name = "cgroup.subtree_control",
4259 		.seq_show = cgroup_subtree_control_show,
4260 		.write = cgroup_subtree_control_write,
4261 	},
4262 	{
4263 		.name = "cgroup.populated",
4264 		.flags = CFTYPE_NOT_ON_ROOT,
4265 		.seq_show = cgroup_populated_show,
4266 	},
4267 	{ }	/* terminate */
4268 };
4269 
4270 /* cgroup core interface files for the legacy hierarchies */
4271 static struct cftype cgroup_legacy_base_files[] = {
4272 	{
4273 		.name = "cgroup.procs",
4274 		.seq_start = cgroup_pidlist_start,
4275 		.seq_next = cgroup_pidlist_next,
4276 		.seq_stop = cgroup_pidlist_stop,
4277 		.seq_show = cgroup_pidlist_show,
4278 		.private = CGROUP_FILE_PROCS,
4279 		.write = cgroup_procs_write,
4280 		.mode = S_IRUGO | S_IWUSR,
4281 	},
4282 	{
4283 		.name = "cgroup.clone_children",
4284 		.read_u64 = cgroup_clone_children_read,
4285 		.write_u64 = cgroup_clone_children_write,
4286 	},
4287 	{
4288 		.name = "cgroup.sane_behavior",
4289 		.flags = CFTYPE_ONLY_ON_ROOT,
4290 		.seq_show = cgroup_sane_behavior_show,
4291 	},
4292 	{
4293 		.name = "tasks",
4294 		.seq_start = cgroup_pidlist_start,
4295 		.seq_next = cgroup_pidlist_next,
4296 		.seq_stop = cgroup_pidlist_stop,
4297 		.seq_show = cgroup_pidlist_show,
4298 		.private = CGROUP_FILE_TASKS,
4299 		.write = cgroup_tasks_write,
4300 		.mode = S_IRUGO | S_IWUSR,
4301 	},
4302 	{
4303 		.name = "notify_on_release",
4304 		.read_u64 = cgroup_read_notify_on_release,
4305 		.write_u64 = cgroup_write_notify_on_release,
4306 	},
4307 	{
4308 		.name = "release_agent",
4309 		.flags = CFTYPE_ONLY_ON_ROOT,
4310 		.seq_show = cgroup_release_agent_show,
4311 		.write = cgroup_release_agent_write,
4312 		.max_write_len = PATH_MAX - 1,
4313 	},
4314 	{ }	/* terminate */
4315 };
4316 
4317 /**
4318  * cgroup_populate_dir - create subsys files in a cgroup directory
4319  * @cgrp: target cgroup
4320  * @subsys_mask: mask of the subsystem ids whose files should be added
4321  *
4322  * On failure, no file is added.
4323  */
4324 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
4325 {
4326 	struct cgroup_subsys *ss;
4327 	int i, ret = 0;
4328 
4329 	/* process cftsets of each subsystem */
4330 	for_each_subsys(ss, i) {
4331 		struct cftype *cfts;
4332 
4333 		if (!(subsys_mask & (1 << i)))
4334 			continue;
4335 
4336 		list_for_each_entry(cfts, &ss->cfts, node) {
4337 			ret = cgroup_addrm_files(cgrp, cfts, true);
4338 			if (ret < 0)
4339 				goto err;
4340 		}
4341 	}
4342 	return 0;
4343 err:
4344 	cgroup_clear_dir(cgrp, subsys_mask);
4345 	return ret;
4346 }
4347 
4348 /*
4349  * css destruction is a four-stage process.
4350  *
4351  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
4352  *    Implemented in kill_css().
4353  *
4354  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4355  *    and thus css_tryget_online() is guaranteed to fail, the css can be
4356  *    offlined by invoking offline_css().  After offlining, the base ref is
4357  *    put.  Implemented in css_killed_work_fn().
4358  *
4359  * 3. When the percpu_ref reaches zero, the only possible remaining
4360  *    accessors are inside RCU read sections.  css_release() schedules the
4361  *    RCU callback.
4362  *
4363  * 4. After the grace period, the css can be freed.  Implemented in
4364  *    css_free_work_fn().
4365  *
4366  * It is actually hairier because both steps 2 and 4 require process context
4367  * and thus involve punting to css->destroy_work, adding two additional
4368  * steps to the already complex sequence.
4369  */
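/*
 * As a rough map of where each stage lives in this file (every arrow is
 * an asynchronous hop through cgroup_destroy_wq or an RCU grace period):
 *
 *	kill_css()
 *	  percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn)
 *	    -> css_killed_work_fn():   offline_css(), put the base ref
 *	  last reference dropped -> css_release()
 *	    -> css_release_work_fn():  mark CSS_RELEASED, unlink the sibling
 *	       call_rcu(css_free_rcu_fn)
 *	         -> css_free_work_fn(): ss->css_free() or cgroup teardown
 */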
4370 static void css_free_work_fn(struct work_struct *work)
4371 {
4372 	struct cgroup_subsys_state *css =
4373 		container_of(work, struct cgroup_subsys_state, destroy_work);
4374 	struct cgroup_subsys *ss = css->ss;
4375 	struct cgroup *cgrp = css->cgroup;
4376 
4377 	percpu_ref_exit(&css->refcnt);
4378 
4379 	if (ss) {
4380 		/* css free path */
4381 		int id = css->id;
4382 
4383 		if (css->parent)
4384 			css_put(css->parent);
4385 
4386 		ss->css_free(css);
4387 		cgroup_idr_remove(&ss->css_idr, id);
4388 		cgroup_put(cgrp);
4389 	} else {
4390 		/* cgroup free path */
4391 		atomic_dec(&cgrp->root->nr_cgrps);
4392 		cgroup_pidlist_destroy_all(cgrp);
4393 		cancel_work_sync(&cgrp->release_agent_work);
4394 
4395 		if (cgroup_parent(cgrp)) {
4396 			/*
4397 			 * We get a ref to the parent, and put the ref when
4398 			 * this cgroup is being freed, so it's guaranteed
4399 			 * that the parent won't be destroyed before its
4400 			 * children.
4401 			 */
4402 			cgroup_put(cgroup_parent(cgrp));
4403 			kernfs_put(cgrp->kn);
4404 			kfree(cgrp);
4405 		} else {
4406 			/*
4407 			 * This is root cgroup's refcnt reaching zero,
4408 			 * which indicates that the root should be
4409 			 * released.
4410 			 */
4411 			cgroup_destroy_root(cgrp->root);
4412 		}
4413 	}
4414 }
4415 
4416 static void css_free_rcu_fn(struct rcu_head *rcu_head)
4417 {
4418 	struct cgroup_subsys_state *css =
4419 		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4420 
4421 	INIT_WORK(&css->destroy_work, css_free_work_fn);
4422 	queue_work(cgroup_destroy_wq, &css->destroy_work);
4423 }
4424 
4425 static void css_release_work_fn(struct work_struct *work)
4426 {
4427 	struct cgroup_subsys_state *css =
4428 		container_of(work, struct cgroup_subsys_state, destroy_work);
4429 	struct cgroup_subsys *ss = css->ss;
4430 	struct cgroup *cgrp = css->cgroup;
4431 
4432 	mutex_lock(&cgroup_mutex);
4433 
4434 	css->flags |= CSS_RELEASED;
4435 	list_del_rcu(&css->sibling);
4436 
4437 	if (ss) {
4438 		/* css release path */
4439 		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4440 		if (ss->css_released)
4441 			ss->css_released(css);
4442 	} else {
4443 		/* cgroup release path */
4444 		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4445 		cgrp->id = -1;
4446 
4447 		/*
4448 		 * There are two control paths which try to determine
4449 		 * cgroup from dentry without going through kernfs -
4450 		 * cgroupstats_build() and css_tryget_online_from_dir().
4451 		 * Those are supported by RCU protecting clearing of
4452 		 * cgrp->kn->priv backpointer.
4453 		 */
4454 		RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4455 	}
4456 
4457 	mutex_unlock(&cgroup_mutex);
4458 
4459 	call_rcu(&css->rcu_head, css_free_rcu_fn);
4460 }
4461 
4462 static void css_release(struct percpu_ref *ref)
4463 {
4464 	struct cgroup_subsys_state *css =
4465 		container_of(ref, struct cgroup_subsys_state, refcnt);
4466 
4467 	INIT_WORK(&css->destroy_work, css_release_work_fn);
4468 	queue_work(cgroup_destroy_wq, &css->destroy_work);
4469 }
4470 
4471 static void init_and_link_css(struct cgroup_subsys_state *css,
4472 			      struct cgroup_subsys *ss, struct cgroup *cgrp)
4473 {
4474 	lockdep_assert_held(&cgroup_mutex);
4475 
4476 	cgroup_get(cgrp);
4477 
4478 	memset(css, 0, sizeof(*css));
4479 	css->cgroup = cgrp;
4480 	css->ss = ss;
4481 	INIT_LIST_HEAD(&css->sibling);
4482 	INIT_LIST_HEAD(&css->children);
4483 	css->serial_nr = css_serial_nr_next++;
4484 	atomic_set(&css->online_cnt, 0);
4485 
4486 	if (cgroup_parent(cgrp)) {
4487 		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
4488 		css_get(css->parent);
4489 	}
4490 
4491 	BUG_ON(cgroup_css(cgrp, ss));
4492 }
4493 
4494 /* invoke ->css_online() on a new CSS and mark it online if successful */
4495 static int online_css(struct cgroup_subsys_state *css)
4496 {
4497 	struct cgroup_subsys *ss = css->ss;
4498 	int ret = 0;
4499 
4500 	lockdep_assert_held(&cgroup_mutex);
4501 
4502 	if (ss->css_online)
4503 		ret = ss->css_online(css);
4504 	if (!ret) {
4505 		css->flags |= CSS_ONLINE;
4506 		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4507 
4508 		atomic_inc(&css->online_cnt);
4509 		if (css->parent)
4510 			atomic_inc(&css->parent->online_cnt);
4511 	}
4512 	return ret;
4513 }
4514 
4515 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4516 static void offline_css(struct cgroup_subsys_state *css)
4517 {
4518 	struct cgroup_subsys *ss = css->ss;
4519 
4520 	lockdep_assert_held(&cgroup_mutex);
4521 
4522 	if (!(css->flags & CSS_ONLINE))
4523 		return;
4524 
4525 	if (ss->css_offline)
4526 		ss->css_offline(css);
4527 
4528 	css->flags &= ~CSS_ONLINE;
4529 	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4530 
4531 	wake_up_all(&css->cgroup->offline_waitq);
4532 }
4533 
4534 /**
4535  * create_css - create a cgroup_subsys_state
4536  * @cgrp: the cgroup new css will be associated with
4537  * @ss: the subsys of new css
4538  * @visible: whether to create control knobs for the new css or not
4539  *
4540  * Create a new css associated with @cgrp - @ss pair.  On success, the new
4541  * css is online and installed in @cgrp with all interface files created if
4542  * @visible.  Returns 0 on success, -errno on failure.
4543  */
4544 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4545 		      bool visible)
4546 {
4547 	struct cgroup *parent = cgroup_parent(cgrp);
4548 	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
4549 	struct cgroup_subsys_state *css;
4550 	int err;
4551 
4552 	lockdep_assert_held(&cgroup_mutex);
4553 
4554 	css = ss->css_alloc(parent_css);
4555 	if (IS_ERR(css))
4556 		return PTR_ERR(css);
4557 
4558 	init_and_link_css(css, ss, cgrp);
4559 
4560 	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4561 	if (err)
4562 		goto err_free_css;
4563 
4564 	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4565 	if (err < 0)
4566 		goto err_free_percpu_ref;
4567 	css->id = err;
4568 
4569 	if (visible) {
4570 		err = cgroup_populate_dir(cgrp, 1 << ss->id);
4571 		if (err)
4572 			goto err_free_id;
4573 	}
4574 
4575 	/* @css is ready to be brought online now, make it visible */
4576 	list_add_tail_rcu(&css->sibling, &parent_css->children);
4577 	cgroup_idr_replace(&ss->css_idr, css, css->id);
4578 
4579 	err = online_css(css);
4580 	if (err)
4581 		goto err_list_del;
4582 
4583 	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4584 	    cgroup_parent(parent)) {
4585 		pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4586 			current->comm, current->pid, ss->name);
4587 		if (!strcmp(ss->name, "memory"))
4588 			pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4589 		ss->warned_broken_hierarchy = true;
4590 	}
4591 
4592 	return 0;
4593 
4594 err_list_del:
4595 	list_del_rcu(&css->sibling);
4596 	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4597 err_free_id:
4598 	cgroup_idr_remove(&ss->css_idr, css->id);
4599 err_free_percpu_ref:
4600 	percpu_ref_exit(&css->refcnt);
4601 err_free_css:
4602 	call_rcu(&css->rcu_head, css_free_rcu_fn);
4603 	return err;
4604 }
4605 
4606 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4607 			umode_t mode)
4608 {
4609 	struct cgroup *parent, *cgrp;
4610 	struct cgroup_root *root;
4611 	struct cgroup_subsys *ss;
4612 	struct kernfs_node *kn;
4613 	struct cftype *base_files;
4614 	int ssid, ret;
4615 
4616 	/* do not accept '\n' to prevent making /proc/<pid>/cgroup
4617 	 * unparsable */
4618 	if (strchr(name, '\n'))
4619 		return -EINVAL;
4620 
4621 	parent = cgroup_kn_lock_live(parent_kn);
4622 	if (!parent)
4623 		return -ENODEV;
4624 	root = parent->root;
4625 
4626 	/* allocate the cgroup and its ID, 0 is reserved for the root */
4627 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4628 	if (!cgrp) {
4629 		ret = -ENOMEM;
4630 		goto out_unlock;
4631 	}
4632 
4633 	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4634 	if (ret)
4635 		goto out_free_cgrp;
4636 
4637 	/*
4638 	 * Temporarily set the pointer to NULL, so idr_find() won't return
4639 	 * a half-baked cgroup.
4640 	 */
4641 	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
4642 	if (cgrp->id < 0) {
4643 		ret = -ENOMEM;
4644 		goto out_cancel_ref;
4645 	}
4646 
4647 	init_cgroup_housekeeping(cgrp);
4648 
4649 	cgrp->self.parent = &parent->self;
4650 	cgrp->root = root;
4651 
4652 	if (notify_on_release(parent))
4653 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4654 
4655 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4656 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4657 
4658 	/* create the directory */
4659 	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
4660 	if (IS_ERR(kn)) {
4661 		ret = PTR_ERR(kn);
4662 		goto out_free_id;
4663 	}
4664 	cgrp->kn = kn;
4665 
4666 	/*
4667 	 * This extra ref will be put in css_free_work_fn() and guarantees
4668 	 * that @cgrp->kn is always accessible.
4669 	 */
4670 	kernfs_get(kn);
4671 
4672 	cgrp->self.serial_nr = css_serial_nr_next++;
4673 
4674 	/* allocation complete, commit to creation */
4675 	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
4676 	atomic_inc(&root->nr_cgrps);
4677 	cgroup_get(parent);
4678 
4679 	/*
4680 	 * @cgrp is now fully operational.  If something fails after this
4681 	 * point, it'll be released via the normal destruction path.
4682 	 */
4683 	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4684 
4685 	ret = cgroup_kn_set_ugid(kn);
4686 	if (ret)
4687 		goto out_destroy;
4688 
4689 	if (cgroup_on_dfl(cgrp))
4690 		base_files = cgroup_dfl_base_files;
4691 	else
4692 		base_files = cgroup_legacy_base_files;
4693 
4694 	ret = cgroup_addrm_files(cgrp, base_files, true);
4695 	if (ret)
4696 		goto out_destroy;
4697 
4698 	/* let's create and online css's */
4699 	for_each_subsys(ss, ssid) {
4700 		if (parent->child_subsys_mask & (1 << ssid)) {
4701 			ret = create_css(cgrp, ss,
4702 					 parent->subtree_control & (1 << ssid));
4703 			if (ret)
4704 				goto out_destroy;
4705 		}
4706 	}
4707 
4708 	/*
4709 	 * On the default hierarchy, a child doesn't automatically inherit
4710 	 * subtree_control from the parent.  Each is configured manually.
4711 	 */
4712 	if (!cgroup_on_dfl(cgrp)) {
4713 		cgrp->subtree_control = parent->subtree_control;
4714 		cgroup_refresh_child_subsys_mask(cgrp);
4715 	}
4716 
4717 	kernfs_activate(kn);
4718 
4719 	ret = 0;
4720 	goto out_unlock;
4721 
4722 out_free_id:
4723 	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4724 out_cancel_ref:
4725 	percpu_ref_exit(&cgrp->self.refcnt);
4726 out_free_cgrp:
4727 	kfree(cgrp);
4728 out_unlock:
4729 	cgroup_kn_unlock(parent_kn);
4730 	return ret;
4731 
4732 out_destroy:
4733 	cgroup_destroy_locked(cgrp);
4734 	goto out_unlock;
4735 }
4736 
4737 /*
4738  * This is called when the refcnt of a css is confirmed to be killed.
4739  * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
4740  * initiate destruction and put the css ref from kill_css().
4741  */
4742 static void css_killed_work_fn(struct work_struct *work)
4743 {
4744 	struct cgroup_subsys_state *css =
4745 		container_of(work, struct cgroup_subsys_state, destroy_work);
4746 
4747 	mutex_lock(&cgroup_mutex);
4748 
4749 	do {
4750 		offline_css(css);
4751 		css_put(css);
4752 		/* @css can't go away while we're holding cgroup_mutex */
4753 		css = css->parent;
4754 	} while (css && atomic_dec_and_test(&css->online_cnt));
4755 
4756 	mutex_unlock(&cgroup_mutex);
4757 }
4758 
4759 /* css kill confirmation processing requires process context, bounce */
4760 static void css_killed_ref_fn(struct percpu_ref *ref)
4761 {
4762 	struct cgroup_subsys_state *css =
4763 		container_of(ref, struct cgroup_subsys_state, refcnt);
4764 
4765 	if (atomic_dec_and_test(&css->online_cnt)) {
4766 		INIT_WORK(&css->destroy_work, css_killed_work_fn);
4767 		queue_work(cgroup_destroy_wq, &css->destroy_work);
4768 	}
4769 }
4770 
4771 /**
4772  * kill_css - destroy a css
4773  * @css: css to destroy
4774  *
4775  * This function initiates destruction of @css by removing cgroup interface
4776  * files and putting its base reference.  ->css_offline() will be invoked
4777  * asynchronously once css_tryget_online() is guaranteed to fail and when
4778  * the reference count reaches zero, @css will be released.
4779  */
4780 static void kill_css(struct cgroup_subsys_state *css)
4781 {
4782 	lockdep_assert_held(&cgroup_mutex);
4783 
4784 	/*
4785 	 * This must happen before css is disassociated with its cgroup.
4786 	 * See seq_css() for details.
4787 	 */
4788 	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4789 
4790 	/*
4791 	 * Killing would put the base ref, but we need to keep it alive
4792 	 * until after ->css_offline().
4793 	 */
4794 	css_get(css);
4795 
4796 	/*
4797 	 * cgroup core guarantees that, by the time ->css_offline() is
4798 	 * invoked, no new css reference will be given out via
4799 	 * css_tryget_online().  We can't simply call percpu_ref_kill() and
4800 	 * proceed to offlining css's because percpu_ref_kill() doesn't
4801 	 * guarantee that the ref is seen as killed on all CPUs on return.
4802 	 *
4803 	 * Use percpu_ref_kill_and_confirm() to get notifications as each
4804 	 * css is confirmed to be seen as killed on all CPUs.
4805 	 */
4806 	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4807 }
4808 
4809 /**
4810  * cgroup_destroy_locked - the first stage of cgroup destruction
4811  * @cgrp: cgroup to be destroyed
4812  *
4813  * css's make use of percpu refcnts whose killing latency shouldn't be
4814  * exposed to userland and are RCU protected.  Also, cgroup core needs to
4815  * guarantee that css_tryget_online() won't succeed by the time
4816  * ->css_offline() is invoked.  To satisfy all the requirements,
4817  * destruction is implemented in the following two steps.
4818  *
4819  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
4820  *     userland visible parts and start killing the percpu refcnts of
4821  *     css's.  Set up so that the next stage will be kicked off once all
4822  *     the percpu refcnts are confirmed to be killed.
4823  *
4824  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4825  *     rest of destruction.  Once all cgroup references are gone, the
4826  *     cgroup is RCU-freed.
4827  *
4828  * This function implements s1.  After this step, @cgrp is gone as far as
4829  * the userland is concerned and a new cgroup with the same name may be
4830  * created.  As cgroup doesn't care about the names internally, this
4831  * doesn't cause any problem.
4832  */
4833 static int cgroup_destroy_locked(struct cgroup *cgrp)
4834 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4835 {
4836 	struct cgroup_subsys_state *css;
4837 	bool empty;
4838 	int ssid;
4839 
4840 	lockdep_assert_held(&cgroup_mutex);
4841 
4842 	/*
4843 	 * css_set_rwsem synchronizes access to ->cset_links and prevents
4844 	 * @cgrp from being removed while put_css_set() is in progress.
4845 	 */
4846 	down_read(&css_set_rwsem);
4847 	empty = list_empty(&cgrp->cset_links);
4848 	up_read(&css_set_rwsem);
4849 	if (!empty)
4850 		return -EBUSY;
4851 
4852 	/*
4853 	 * Make sure there are no live children.  We can't test emptiness of
4854 	 * ->self.children as dead children linger on it while being
4855 	 * drained; otherwise, "rmdir parent/child parent" may fail.
4856 	 */
4857 	if (css_has_online_children(&cgrp->self))
4858 		return -EBUSY;
4859 
4860 	/*
4861 	 * Mark @cgrp dead.  This prevents further task migration and child
4862 	 * creation by disabling cgroup_lock_live_group().
4863 	 */
4864 	cgrp->self.flags &= ~CSS_ONLINE;
4865 
4866 	/* initiate massacre of all css's */
4867 	for_each_css(css, ssid, cgrp)
4868 		kill_css(css);
4869 
4870 	/*
4871 	 * Remove @cgrp directory along with the base files.  @cgrp has an
4872 	 * extra ref on its kn.
4873 	 */
4874 	kernfs_remove(cgrp->kn);
4875 
4876 	check_for_release(cgroup_parent(cgrp));
4877 
4878 	/* put the base reference */
4879 	percpu_ref_kill(&cgrp->self.refcnt);
4880 
4881 	return 0;
4882 }
4883 
4884 static int cgroup_rmdir(struct kernfs_node *kn)
4885 {
4886 	struct cgroup *cgrp;
4887 	int ret = 0;
4888 
4889 	cgrp = cgroup_kn_lock_live(kn);
4890 	if (!cgrp)
4891 		return 0;
4892 
4893 	ret = cgroup_destroy_locked(cgrp);
4894 
4895 	cgroup_kn_unlock(kn);
4896 	return ret;
4897 }
4898 
4899 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4900 	.remount_fs		= cgroup_remount,
4901 	.show_options		= cgroup_show_options,
4902 	.mkdir			= cgroup_mkdir,
4903 	.rmdir			= cgroup_rmdir,
4904 	.rename			= cgroup_rename,
4905 };
4906 
4907 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4908 {
4909 	struct cgroup_subsys_state *css;
4910 
4911 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4912 
4913 	mutex_lock(&cgroup_mutex);
4914 
4915 	idr_init(&ss->css_idr);
4916 	INIT_LIST_HEAD(&ss->cfts);
4917 
4918 	/* Create the root cgroup state for this subsystem */
4919 	ss->root = &cgrp_dfl_root;
4920 	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4921 	/* We don't handle early failures gracefully */
4922 	BUG_ON(IS_ERR(css));
4923 	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4924 
4925 	/*
4926 	 * Root csses are never destroyed and we can't initialize
4927 	 * percpu_ref during early init.  Disable refcnting.
4928 	 */
4929 	css->flags |= CSS_NO_REF;
4930 
4931 	if (early) {
4932 		/* allocation can't be done safely during early init */
4933 		css->id = 1;
4934 	} else {
4935 		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4936 		BUG_ON(css->id < 0);
4937 	}
4938 
4939 	/* Update the init_css_set to contain a subsys
4940 	 * pointer to this state - since the subsystem is
4941 	 * newly registered, all tasks and hence the
4942 	 * init_css_set is in the subsystem's root cgroup. */
4943 	init_css_set.subsys[ss->id] = css;
4944 
4945 	need_forkexit_callback |= ss->fork || ss->exit;
4946 
4947 	/* At system boot, before all subsystems have been
4948 	 * registered, no tasks have been forked, so we don't
4949 	 * need to invoke fork callbacks here. */
4950 	BUG_ON(!list_empty(&init_task.tasks));
4951 
4952 	BUG_ON(online_css(css));
4953 
4954 	mutex_unlock(&cgroup_mutex);
4955 }
4956 
4957 /**
4958  * cgroup_init_early - cgroup initialization at system boot
4959  *
4960  * Initialize cgroups at system boot, and initialize any
4961  * subsystems that request early init.
4962  */
4963 int __init cgroup_init_early(void)
4964 {
4965 	static struct cgroup_sb_opts __initdata opts;
4966 	struct cgroup_subsys *ss;
4967 	int i;
4968 
4969 	init_cgroup_root(&cgrp_dfl_root, &opts);
4970 	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4971 
4972 	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4973 
4974 	for_each_subsys(ss, i) {
4975 		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4976 		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4977 		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4978 		     ss->id, ss->name);
4979 		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4980 		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4981 
4982 		ss->id = i;
4983 		ss->name = cgroup_subsys_name[i];
4984 
4985 		if (ss->early_init)
4986 			cgroup_init_subsys(ss, true);
4987 	}
4988 	return 0;
4989 }
4990 
4991 /**
4992  * cgroup_init - cgroup initialization
4993  *
4994  * Register cgroup filesystem and /proc file, and initialize
4995  * any subsystems that didn't request early init.
4996  */
4997 int __init cgroup_init(void)
4998 {
4999 	struct cgroup_subsys *ss;
5000 	unsigned long key;
5001 	int ssid, err;
5002 
5003 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
5004 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
5005 
5006 	mutex_lock(&cgroup_mutex);
5007 
5008 	/* Add init_css_set to the hash table */
5009 	key = css_set_hash(init_css_set.subsys);
5010 	hash_add(css_set_table, &init_css_set.hlist, key);
5011 
5012 	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
5013 
5014 	mutex_unlock(&cgroup_mutex);
5015 
5016 	for_each_subsys(ss, ssid) {
5017 		if (ss->early_init) {
5018 			struct cgroup_subsys_state *css =
5019 				init_css_set.subsys[ss->id];
5020 
5021 			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
5022 						   GFP_KERNEL);
5023 			BUG_ON(css->id < 0);
5024 		} else {
5025 			cgroup_init_subsys(ss, false);
5026 		}
5027 
5028 		list_add_tail(&init_css_set.e_cset_node[ssid],
5029 			      &cgrp_dfl_root.cgrp.e_csets[ssid]);
5030 
5031 		/*
5032 		 * Setting dfl_root subsys_mask needs to consider the
5033 		 * disabled flag and cftype registration needs kmalloc,
5034 		 * both of which aren't available during early_init.
5035 		 */
5036 		if (ss->disabled)
5037 			continue;
5038 
5039 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5040 
5041 		if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
5042 			ss->dfl_cftypes = ss->legacy_cftypes;
5043 
5044 		if (!ss->dfl_cftypes)
5045 			cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
5046 
5047 		if (ss->dfl_cftypes == ss->legacy_cftypes) {
5048 			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
5049 		} else {
5050 			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5051 			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5052 		}
5053 
5054 		if (ss->bind)
5055 			ss->bind(init_css_set.subsys[ssid]);
5056 	}
5057 
5058 	err = sysfs_create_mount_point(fs_kobj, "cgroup");
5059 	if (err)
5060 		return err;
5061 
5062 	err = register_filesystem(&cgroup_fs_type);
5063 	if (err < 0) {
5064 		sysfs_remove_mount_point(fs_kobj, "cgroup");
5065 		return err;
5066 	}
5067 
5068 	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
5069 	return 0;
5070 }
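
/*
 * Illustration (mount point and controller names are examples only):
 * once cgroup_init() has registered cgroup_fs_type and created the
 * "cgroup" mount point under /sys/fs, userspace can mount a hierarchy
 * with a chosen set of controllers, e.g.
 *
 *	mount -t cgroup -o cpu,cpuacct cgroup /sys/fs/cgroup/cpu,cpuacct
 *
 * The option names accepted by the mount are the ss->name strings
 * initialized in cgroup_init_early().
 */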
5071 
5072 static int __init cgroup_wq_init(void)
5073 {
5074 	/*
5075 	 * There isn't much point in executing the destruction path in
5076 	 * parallel.  A good chunk of it is serialized with cgroup_mutex anyway.
5077 	 * Use 1 for @max_active.
5078 	 *
5079 	 * We would prefer to do this in cgroup_init() above, but that
5080 	 * is called before init_workqueues(), so leave this until after.
5081 	 */
5082 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5083 	BUG_ON(!cgroup_destroy_wq);
5084 
5085 	/*
5086 	 * Used to destroy pidlists; kept separate so it can serve as a flush domain.
5087 	 * Cap @max_active to 1 too.
5088 	 */
5089 	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
5090 						    0, 1);
5091 	BUG_ON(!cgroup_pidlist_destroy_wq);
5092 
5093 	return 0;
5094 }
5095 core_initcall(cgroup_wq_init);
5096 
5097 /*
5098  * proc_cgroup_show()
5099  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
5100  *  - Used for /proc/<pid>/cgroup.
5101  */
5102 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5103 		     struct pid *pid, struct task_struct *tsk)
5104 {
5105 	char *buf, *path;
5106 	int retval;
5107 	struct cgroup_root *root;
5108 
5109 	retval = -ENOMEM;
5110 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
5111 	if (!buf)
5112 		goto out;
5113 
5114 	mutex_lock(&cgroup_mutex);
5115 	down_read(&css_set_rwsem);
5116 
5117 	for_each_root(root) {
5118 		struct cgroup_subsys *ss;
5119 		struct cgroup *cgrp;
5120 		int ssid, count = 0;
5121 
5122 		if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
5123 			continue;
5124 
5125 		seq_printf(m, "%d:", root->hierarchy_id);
5126 		for_each_subsys(ss, ssid)
5127 			if (root->subsys_mask & (1 << ssid))
5128 				seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
5129 		if (strlen(root->name))
5130 			seq_printf(m, "%sname=%s", count ? "," : "",
5131 				   root->name);
5132 		seq_putc(m, ':');
5133 		cgrp = task_cgroup_from_root(tsk, root);
5134 		path = cgroup_path(cgrp, buf, PATH_MAX);
5135 		if (!path) {
5136 			retval = -ENAMETOOLONG;
5137 			goto out_unlock;
5138 		}
5139 		seq_puts(m, path);
5140 		seq_putc(m, '\n');
5141 	}
5142 
5143 	retval = 0;
5144 out_unlock:
5145 	up_read(&css_set_rwsem);
5146 	mutex_unlock(&cgroup_mutex);
5147 	kfree(buf);
5148 out:
5149 	return retval;
5150 }
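
/*
 * Sample /proc/<pid>/cgroup output produced by proc_cgroup_show(), one
 * line per hierarchy in "hierarchy-id:controller-list:path" form (the
 * hierarchies and paths shown are illustrative only):
 *
 *	4:memory:/user/test
 *	3:cpu,cpuacct:/
 *	1:name=systemd:/user.slice
 */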
5151 
5152 /* Display information about each subsystem and each hierarchy */
5153 static int proc_cgroupstats_show(struct seq_file *m, void *v)
5154 {
5155 	struct cgroup_subsys *ss;
5156 	int i;
5157 
5158 	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
5159 	/*
5160 	 * Ideally we don't want subsystems moving around while we do this.
5161 	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
5162 	 * subsys/hierarchy state.
5163 	 */
5164 	mutex_lock(&cgroup_mutex);
5165 
5166 	for_each_subsys(ss, i)
5167 		seq_printf(m, "%s\t%d\t%d\t%d\n",
5168 			   ss->name, ss->root->hierarchy_id,
5169 			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
5170 
5171 	mutex_unlock(&cgroup_mutex);
5172 	return 0;
5173 }
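
/*
 * Sample /proc/cgroups output generated by proc_cgroupstats_show()
 * (the numbers are illustrative only):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset	2	1	1
 *	cpu	3	12	1
 *	memory	4	40	1
 */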
5174 
5175 static int cgroupstats_open(struct inode *inode, struct file *file)
5176 {
5177 	return single_open(file, proc_cgroupstats_show, NULL);
5178 }
5179 
5180 static const struct file_operations proc_cgroupstats_operations = {
5181 	.open = cgroupstats_open,
5182 	.read = seq_read,
5183 	.llseek = seq_lseek,
5184 	.release = single_release,
5185 };
5186 
5187 /**
5188  * cgroup_fork - initialize cgroup related fields during copy_process()
5189  * @child: pointer to task_struct of the newly forked child process.
5190  *
5191  * A task is associated with the init_css_set until cgroup_post_fork()
5192  * attaches it to the parent's css_set.  Empty cg_list indicates that
5193  * @child isn't holding a reference to its css_set.
5194  */
5195 void cgroup_fork(struct task_struct *child)
5196 {
5197 	RCU_INIT_POINTER(child->cgroups, &init_css_set);
5198 	INIT_LIST_HEAD(&child->cg_list);
5199 }
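
/*
 * The fork path is thus split in two: cgroup_fork() above runs early in
 * copy_process() and merely points @child at init_css_set, while
 * cgroup_post_fork() below performs the real css_set association once
 * the child is visible on the task list.
 */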
5200 
5201 /**
5202  * cgroup_post_fork - called on a new task after adding it to the task list
5203  * @child: the task in question
5204  *
5205  * Adds the task to the list running through its css_set if necessary and
5206  * calls the subsystem fork() callbacks.  This must happen after the task is
5207  * visible on the task list in case we race with the first call to
5208  * css_task_iter_start() - to guarantee that the new task ends up on its
5209  * list.
5210  */
5211 void cgroup_post_fork(struct task_struct *child)
5212 {
5213 	struct cgroup_subsys *ss;
5214 	int i;
5215 
5216 	/*
5217 	 * This may race against cgroup_enable_task_cg_lists().  As that
5218 	 * function sets use_task_css_set_links before grabbing
5219 	 * tasklist_lock and we just went through tasklist_lock to add
5220 	 * @child, it's guaranteed that either we see the set
5221 	 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5222 	 * @child during its iteration.
5223 	 *
5224 	 * If we won the race, @child is associated with %current's
5225 	 * css_set.  Grabbing css_set_rwsem guarantees both that the
5226 	 * association is stable, and, on completion of the parent's
5227 	 * migration, @child is visible in the source of migration or
5228 	 * already in the destination cgroup.  This guarantee is necessary
5229 	 * when implementing operations which need to migrate all tasks of
5230 	 * a cgroup to another.
5231 	 *
5232 	 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5233 	 * will remain in init_css_set.  This is safe because all tasks are
5234 	 * in the init_css_set before cg_links is enabled and there's no
5235 	 * operation which transfers all tasks out of init_css_set.
5236 	 */
5237 	if (use_task_css_set_links) {
5238 		struct css_set *cset;
5239 
5240 		down_write(&css_set_rwsem);
5241 		cset = task_css_set(current);
5242 		if (list_empty(&child->cg_list)) {
5243 			rcu_assign_pointer(child->cgroups, cset);
5244 			list_add(&child->cg_list, &cset->tasks);
5245 			get_css_set(cset);
5246 		}
5247 		up_write(&css_set_rwsem);
5248 	}
5249 
5250 	/*
5251 	 * Call ss->fork().  This must happen after @child is linked on
5252 	 * css_set; otherwise, @child might change state between ->fork()
5253 	 * and addition to css_set.
5254 	 */
5255 	if (need_forkexit_callback) {
5256 		for_each_subsys(ss, i)
5257 			if (ss->fork)
5258 				ss->fork(child);
5259 	}
5260 }
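
/*
 * A subsystem that needs per-task fork handling only has to provide a
 * ->fork callback; need_forkexit_callback is set in cgroup_init_subsys()
 * whenever a registered subsystem has ->fork or ->exit, so the loop
 * above is skipped entirely when no subsystem uses them.
 */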
5261 
5262 /**
5263  * cgroup_exit - detach cgroup from exiting task
5264  * @tsk: pointer to task_struct of exiting process
5265  *
5266  * Description: Detach cgroup from @tsk and release it.
5267  *
5268  * Note that cgroups marked notify_on_release force every task in
5269  * them to take the global cgroup_mutex when exiting.
5270  * This could impact scaling on very large systems.  Be reluctant to
5271  * use notify_on_release cgroups where very high task exit scaling
5272  * is required on large systems.
5273  *
5274  * We set the exiting task's cgroup to the root cgroup (top_cgroup).  We
5275  * call cgroup_exit() while the task is still competent to handle
5276  * notify_on_release(), then leave the task attached to the root cgroup in
5277  * each hierarchy for the remainder of its exit.  No need to bother with
5278  * init_css_set refcnting.  init_css_set never goes away and we can't race
5279  * with the migration path - PF_EXITING is visible to it.
5280  */
5281 void cgroup_exit(struct task_struct *tsk)
5282 {
5283 	struct cgroup_subsys *ss;
5284 	struct css_set *cset;
5285 	bool put_cset = false;
5286 	int i;
5287 
5288 	/*
5289 	 * Unlink @tsk from its css_set.  As the migration path can't race
5290 	 * with us, we can check cg_list without grabbing css_set_rwsem.
5291 	 */
5292 	if (!list_empty(&tsk->cg_list)) {
5293 		down_write(&css_set_rwsem);
5294 		list_del_init(&tsk->cg_list);
5295 		up_write(&css_set_rwsem);
5296 		put_cset = true;
5297 	}
5298 
5299 	/* Reassign the task to the init_css_set. */
5300 	cset = task_css_set(tsk);
5301 	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5302 
5303 	if (need_forkexit_callback) {
5304 		/* see cgroup_post_fork() for details */
5305 		for_each_subsys(ss, i) {
5306 			if (ss->exit) {
5307 				struct cgroup_subsys_state *old_css = cset->subsys[i];
5308 				struct cgroup_subsys_state *css = task_css(tsk, i);
5309 
5310 				ss->exit(css, old_css, tsk);
5311 			}
5312 		}
5313 	}
5314 
5315 	if (put_cset)
5316 		put_css_set(cset);
5317 }
5318 
5319 static void check_for_release(struct cgroup *cgrp)
5320 {
5321 	if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
5322 	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
5323 		schedule_work(&cgrp->release_agent_work);
5324 }
5325 
5326 /*
5327  * Notify userspace when a cgroup is released, by running the
5328  * configured release agent with the name of the cgroup (path
5329  * relative to the root of cgroup file system) as the argument.
5330  *
5331  * Most likely, this user command will try to rmdir this cgroup.
5332  *
5333  * This races with the possibility that some other task will be
5334  * attached to this cgroup before it is removed, or that some other
5335  * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
5336  * The presumed 'rmdir' will fail quietly if this cgroup is no longer
5337  * unused, and this cgroup will be reprieved from its death sentence,
5338  * to continue to serve a useful existence.  Next time it's released,
5339  * we will get notified again, if it still has 'notify_on_release' set.
5340  *
5341  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
5342  * means only wait until the task is successfully execve()'d.  The
5343  * separate release agent task is forked by call_usermodehelper(),
5344  * then control in this thread returns here, without waiting for the
5345  * release agent task.  We don't bother to wait because the caller of
5346  * this routine has no use for the exit status of the release agent
5347  * task, so no sense holding our caller up for that.
5348  */
5349 static void cgroup_release_agent(struct work_struct *work)
5350 {
5351 	struct cgroup *cgrp =
5352 		container_of(work, struct cgroup, release_agent_work);
5353 	char *pathbuf = NULL, *agentbuf = NULL, *path;
5354 	char *argv[3], *envp[3];
5355 
5356 	mutex_lock(&cgroup_mutex);
5357 
5358 	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5359 	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5360 	if (!pathbuf || !agentbuf)
5361 		goto out;
5362 
5363 	path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5364 	if (!path)
5365 		goto out;
5366 
5367 	argv[0] = agentbuf;
5368 	argv[1] = path;
5369 	argv[2] = NULL;
5370 
5371 	/* minimal command environment */
5372 	envp[0] = "HOME=/";
5373 	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5374 	envp[2] = NULL;
5375 
5376 	mutex_unlock(&cgroup_mutex);
5377 	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5378 	goto out_free;
5379 out:
5380 	mutex_unlock(&cgroup_mutex);
5381 out_free:
5382 	kfree(agentbuf);
5383 	kfree(pathbuf);
5384 }
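
/*
 * Illustration (the agent path and cgroup name are hypothetical): with
 * release_agent_path set to "/sbin/cgroup_release" and a newly unused
 * cgroup at "/jobs/batch1", the helper is invoked roughly as
 *
 *	argv = { "/sbin/cgroup_release", "/jobs/batch1", NULL };
 *
 * with the minimal HOME/PATH environment built above.  Because
 * UMH_WAIT_EXEC is used, this work item returns as soon as the agent
 * has been execve()'d, without waiting for it to exit.
 */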
5385 
5386 static int __init cgroup_disable(char *str)
5387 {
5388 	struct cgroup_subsys *ss;
5389 	char *token;
5390 	int i;
5391 
5392 	while ((token = strsep(&str, ",")) != NULL) {
5393 		if (!*token)
5394 			continue;
5395 
5396 		for_each_subsys(ss, i) {
5397 			if (!strcmp(token, ss->name)) {
5398 				ss->disabled = 1;
5399 				printk(KERN_INFO "Disabling %s control group"
5400 					" subsystem\n", ss->name);
5401 				break;
5402 			}
5403 		}
5404 	}
5405 	return 1;
5406 }
5407 __setup("cgroup_disable=", cgroup_disable);
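
/*
 * Example boot parameter usage (the controller names are examples; any
 * ss->name is accepted, comma separated, per the strsep() loop above):
 *
 *	cgroup_disable=memory,cpuset
 */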
5408 
5409 static int __init cgroup_set_legacy_files_on_dfl(char *str)
5410 {
5411 	printk("cgroup: using legacy files on the default hierarchy\n");
5412 	cgroup_legacy_files_on_dfl = true;
5413 	return 0;
5414 }
5415 __setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
5416 
5417 /**
5418  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5419  * @dentry: directory dentry of interest
5420  * @ss: subsystem of interest
5421  *
5422  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5423  * to get the corresponding css and return it.  If such css doesn't exist
5424  * or can't be pinned, an ERR_PTR value is returned.
5425  */
5426 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
5427 						       struct cgroup_subsys *ss)
5428 {
5429 	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
5430 	struct cgroup_subsys_state *css = NULL;
5431 	struct cgroup *cgrp;
5432 
5433 	/* is @dentry a cgroup dir? */
5434 	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
5435 	    kernfs_type(kn) != KERNFS_DIR)
5436 		return ERR_PTR(-EBADF);
5437 
5438 	rcu_read_lock();
5439 
5440 	/*
5441 	 * This path doesn't originate from kernfs, so @kn may already have
5442 	 * been removed or may be removed at any point.  @kn->priv is RCU
5443 	 * protected for this access.  See css_release_work_fn() for details.
5444 	 */
5445 	cgrp = rcu_dereference(kn->priv);
5446 	if (cgrp)
5447 		css = cgroup_css(cgrp, ss);
5448 
5449 	if (!css || !css_tryget_online(css))
5450 		css = ERR_PTR(-ENOENT);
5451 
5452 	rcu_read_unlock();
5453 	return css;
5454 }
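
/*
 * A caller that gets a valid css back from css_tryget_online_from_dir()
 * holds the reference taken by the successful css_tryget_online() above
 * and must drop it with css_put() when done.
 */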
5455 
5456 /**
5457  * css_from_id - lookup css by id
5458  * @id: the cgroup id
5459  * @ss: cgroup subsys to be looked into
5460  *
5461  * Returns the css if there's valid one with @id, otherwise returns NULL.
5462  * Should be called under rcu_read_lock().
5463  */
5464 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5465 {
5466 	WARN_ON_ONCE(!rcu_read_lock_held());
5467 	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
5468 }
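
/*
 * Minimal usage sketch (the subsystem chosen is only an example and
 * depends on the kernel configuration):
 *
 *	rcu_read_lock();
 *	css = css_from_id(id, &memory_cgrp_subsys);
 *	if (css && css_tryget_online(css)) {
 *		... use css ...
 *		css_put(css);
 *	}
 *	rcu_read_unlock();
 */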
5469 
5470 #ifdef CONFIG_CGROUP_DEBUG
5471 static struct cgroup_subsys_state *
5472 debug_css_alloc(struct cgroup_subsys_state *parent_css)
5473 {
5474 	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5475 
5476 	if (!css)
5477 		return ERR_PTR(-ENOMEM);
5478 
5479 	return css;
5480 }
5481 
5482 static void debug_css_free(struct cgroup_subsys_state *css)
5483 {
5484 	kfree(css);
5485 }
5486 
5487 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5488 				struct cftype *cft)
5489 {
5490 	return cgroup_task_count(css->cgroup);
5491 }
5492 
5493 static u64 current_css_set_read(struct cgroup_subsys_state *css,
5494 				struct cftype *cft)
5495 {
5496 	return (u64)(unsigned long)current->cgroups;
5497 }
5498 
5499 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5500 					 struct cftype *cft)
5501 {
5502 	u64 count;
5503 
5504 	rcu_read_lock();
5505 	count = atomic_read(&task_css_set(current)->refcount);
5506 	rcu_read_unlock();
5507 	return count;
5508 }
5509 
5510 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5511 {
5512 	struct cgrp_cset_link *link;
5513 	struct css_set *cset;
5514 	char *name_buf;
5515 
5516 	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
5517 	if (!name_buf)
5518 		return -ENOMEM;
5519 
5520 	down_read(&css_set_rwsem);
5521 	rcu_read_lock();
5522 	cset = rcu_dereference(current->cgroups);
5523 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5524 		struct cgroup *c = link->cgrp;
5525 
5526 		cgroup_name(c, name_buf, NAME_MAX + 1);
5527 		seq_printf(seq, "Root %d group %s\n",
5528 			   c->root->hierarchy_id, name_buf);
5529 	}
5530 	rcu_read_unlock();
5531 	up_read(&css_set_rwsem);
5532 	kfree(name_buf);
5533 	return 0;
5534 }
5535 
5536 #define MAX_TASKS_SHOWN_PER_CSS 25
5537 static int cgroup_css_links_read(struct seq_file *seq, void *v)
5538 {
5539 	struct cgroup_subsys_state *css = seq_css(seq);
5540 	struct cgrp_cset_link *link;
5541 
5542 	down_read(&css_set_rwsem);
5543 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5544 		struct css_set *cset = link->cset;
5545 		struct task_struct *task;
5546 		int count = 0;
5547 
5548 		seq_printf(seq, "css_set %p\n", cset);
5549 
5550 		list_for_each_entry(task, &cset->tasks, cg_list) {
5551 			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5552 				goto overflow;
5553 			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
5554 		}
5555 
5556 		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
5557 			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5558 				goto overflow;
5559 			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
5560 		}
5561 		continue;
5562 	overflow:
5563 		seq_puts(seq, "  ...\n");
5564 	}
5565 	up_read(&css_set_rwsem);
5566 	return 0;
5567 }
5568 
5569 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5570 {
5571 	return (!cgroup_has_tasks(css->cgroup) &&
5572 		!css_has_online_children(&css->cgroup->self));
5573 }
5574 
5575 static struct cftype debug_files[] =  {
5576 	{
5577 		.name = "taskcount",
5578 		.read_u64 = debug_taskcount_read,
5579 	},
5580 
5581 	{
5582 		.name = "current_css_set",
5583 		.read_u64 = current_css_set_read,
5584 	},
5585 
5586 	{
5587 		.name = "current_css_set_refcount",
5588 		.read_u64 = current_css_set_refcount_read,
5589 	},
5590 
5591 	{
5592 		.name = "current_css_set_cg_links",
5593 		.seq_show = current_css_set_cg_links_read,
5594 	},
5595 
5596 	{
5597 		.name = "cgroup_css_links",
5598 		.seq_show = cgroup_css_links_read,
5599 	},
5600 
5601 	{
5602 		.name = "releasable",
5603 		.read_u64 = releasable_read,
5604 	},
5605 
5606 	{ }	/* terminate */
5607 };
5608 
5609 struct cgroup_subsys debug_cgrp_subsys = {
5610 	.css_alloc = debug_css_alloc,
5611 	.css_free = debug_css_free,
5612 	.legacy_cftypes = debug_files,
5613 };
5614 #endif /* CONFIG_CGROUP_DEBUG */
5615