This source file includes the following definitions:
- do_memsw_account
- should_force_charge
- memcg_to_vmpressure
- vmpressure_to_css
- memcg_get_cache_ids
- memcg_put_cache_ids
- memcg_free_shrinker_map_rcu
- memcg_expand_one_shrinker_map
- memcg_free_shrinker_maps
- memcg_alloc_shrinker_maps
- memcg_expand_shrinker_maps
- memcg_set_shrinker_bit
- mem_cgroup_css_from_page
- page_cgroup_ino
- mem_cgroup_page_nodeinfo
- soft_limit_tree_node
- soft_limit_tree_from_page
- __mem_cgroup_insert_exceeded
- __mem_cgroup_remove_exceeded
- mem_cgroup_remove_exceeded
- soft_limit_excess
- mem_cgroup_update_tree
- mem_cgroup_remove_from_trees
- __mem_cgroup_largest_soft_limit_node
- mem_cgroup_largest_soft_limit_node
- __mod_memcg_state
- parent_nodeinfo
- __mod_lruvec_state
- __mod_lruvec_slab_state
- mod_memcg_obj_state
- __count_memcg_events
- memcg_events
- memcg_events_local
- mem_cgroup_charge_statistics
- mem_cgroup_event_ratelimit
- memcg_check_events
- mem_cgroup_from_task
- get_mem_cgroup_from_mm
- get_mem_cgroup_from_page
- get_mem_cgroup_from_current
- mem_cgroup_iter
- mem_cgroup_iter_break
- __invalidate_reclaim_iterators
- invalidate_reclaim_iterators
- mem_cgroup_scan_tasks
- mem_cgroup_page_lruvec
- mem_cgroup_update_lru_size
- mem_cgroup_margin
- mem_cgroup_under_move
- mem_cgroup_wait_acct_move
- memory_stat_format
- mem_cgroup_print_oom_context
- mem_cgroup_print_oom_meminfo
- mem_cgroup_get_max
- mem_cgroup_size
- mem_cgroup_out_of_memory
- test_mem_cgroup_node_reclaimable
- mem_cgroup_may_update_nodemask
- mem_cgroup_select_victim_node
- mem_cgroup_select_victim_node
- mem_cgroup_soft_reclaim
- mem_cgroup_oom_trylock
- mem_cgroup_oom_unlock
- mem_cgroup_mark_under_oom
- mem_cgroup_unmark_under_oom
- memcg_oom_wake_function
- memcg_oom_recover
- mem_cgroup_oom
- mem_cgroup_oom_synchronize
- mem_cgroup_get_oom_group
- mem_cgroup_print_oom_group
- lock_page_memcg
- __unlock_page_memcg
- unlock_page_memcg
- consume_stock
- drain_stock
- drain_local_stock
- refill_stock
- drain_all_stock
- memcg_hotplug_cpu_dead
- reclaim_high
- high_work_func
- calculate_high_delay
- mem_cgroup_handle_over_high
- try_charge
- cancel_charge
- lock_page_lru
- unlock_page_lru
- commit_charge
- mem_cgroup_from_obj
- memcg_alloc_cache_id
- memcg_free_cache_id
- memcg_kmem_cache_create_func
- memcg_schedule_kmem_cache_create
- memcg_kmem_bypass
- memcg_kmem_get_cache
- memcg_kmem_put_cache
- __memcg_kmem_charge_memcg
- __memcg_kmem_charge
- __memcg_kmem_uncharge_memcg
- __memcg_kmem_uncharge
- mem_cgroup_split_huge_fixup
- mem_cgroup_move_swap_account
- mem_cgroup_move_swap_account
- mem_cgroup_resize_max
- mem_cgroup_soft_limit_reclaim
- memcg_has_children
- mem_cgroup_force_empty
- mem_cgroup_force_empty_write
- mem_cgroup_hierarchy_read
- mem_cgroup_hierarchy_write
- mem_cgroup_usage
- mem_cgroup_read_u64
- memcg_flush_percpu_vmstats
- memcg_flush_percpu_vmevents
- memcg_online_kmem
- memcg_offline_kmem
- memcg_free_kmem
- memcg_online_kmem
- memcg_offline_kmem
- memcg_free_kmem
- memcg_update_kmem_max
- memcg_update_tcp_max
- mem_cgroup_write
- mem_cgroup_reset
- mem_cgroup_move_charge_read
- mem_cgroup_move_charge_write
- mem_cgroup_move_charge_write
- mem_cgroup_node_nr_lru_pages
- mem_cgroup_nr_lru_pages
- memcg_numa_stat_show
- memcg_stat_show
- mem_cgroup_swappiness_read
- mem_cgroup_swappiness_write
- __mem_cgroup_threshold
- mem_cgroup_threshold
- compare_thresholds
- mem_cgroup_oom_notify_cb
- mem_cgroup_oom_notify
- __mem_cgroup_usage_register_event
- mem_cgroup_usage_register_event
- memsw_cgroup_usage_register_event
- __mem_cgroup_usage_unregister_event
- mem_cgroup_usage_unregister_event
- memsw_cgroup_usage_unregister_event
- mem_cgroup_oom_register_event
- mem_cgroup_oom_unregister_event
- mem_cgroup_oom_control_read
- mem_cgroup_oom_control_write
- memcg_wb_domain_init
- memcg_wb_domain_exit
- memcg_wb_domain_size_changed
- mem_cgroup_wb_domain
- memcg_exact_page_state
- mem_cgroup_wb_stats
- mem_cgroup_track_foreign_dirty_slowpath
- mem_cgroup_flush_foreign
- memcg_wb_domain_init
- memcg_wb_domain_exit
- memcg_wb_domain_size_changed
- memcg_event_remove
- memcg_event_wake
- memcg_event_ptable_queue_proc
- memcg_write_event_control
- mem_cgroup_id_remove
- mem_cgroup_id_get_many
- mem_cgroup_id_put_many
- mem_cgroup_id_put
- mem_cgroup_from_id
- alloc_mem_cgroup_per_node_info
- free_mem_cgroup_per_node_info
- __mem_cgroup_free
- mem_cgroup_free
- mem_cgroup_alloc
- mem_cgroup_css_alloc
- mem_cgroup_css_online
- mem_cgroup_css_offline
- mem_cgroup_css_released
- mem_cgroup_css_free
- mem_cgroup_css_reset
- mem_cgroup_do_precharge
- mc_handle_present_pte
- mc_handle_swap_pte
- mc_handle_swap_pte
- mc_handle_file_pte
- mem_cgroup_move_account
- get_mctgt_type
- get_mctgt_type_thp
- get_mctgt_type_thp
- mem_cgroup_count_precharge_pte_range
- mem_cgroup_count_precharge
- mem_cgroup_precharge_mc
- __mem_cgroup_clear_mc
- mem_cgroup_clear_mc
- mem_cgroup_can_attach
- mem_cgroup_cancel_attach
- mem_cgroup_move_charge_pte_range
- mem_cgroup_move_charge
- mem_cgroup_move_task
- mem_cgroup_can_attach
- mem_cgroup_cancel_attach
- mem_cgroup_move_task
- mem_cgroup_bind
- seq_puts_memcg_tunable
- memory_current_read
- memory_min_show
- memory_min_write
- memory_low_show
- memory_low_write
- memory_high_show
- memory_high_write
- memory_max_show
- memory_max_write
- __memory_events_show
- memory_events_show
- memory_events_local_show
- memory_stat_show
- memory_oom_group_show
- memory_oom_group_write
- mem_cgroup_protected
- mem_cgroup_try_charge
- mem_cgroup_try_charge_delay
- mem_cgroup_commit_charge
- mem_cgroup_cancel_charge
- uncharge_gather_clear
- uncharge_batch
- uncharge_page
- uncharge_list
- mem_cgroup_uncharge
- mem_cgroup_uncharge_list
- mem_cgroup_migrate
- mem_cgroup_sk_alloc
- mem_cgroup_sk_free
- mem_cgroup_charge_skmem
- mem_cgroup_uncharge_skmem
- cgroup_memory
- mem_cgroup_init
- mem_cgroup_id_get_online
- mem_cgroup_swapout
- mem_cgroup_try_charge_swap
- mem_cgroup_uncharge_swap
- mem_cgroup_get_nr_swap_pages
- mem_cgroup_swap_full
- enable_swap_account
- swap_current_read
- swap_max_show
- swap_max_write
- swap_events_show
- mem_cgroup_swap_init
(lines 1-24: the file's header comment is not included in this listing)
25 #include <linux/page_counter.h>
26 #include <linux/memcontrol.h>
27 #include <linux/cgroup.h>
28 #include <linux/pagewalk.h>
29 #include <linux/sched/mm.h>
30 #include <linux/shmem_fs.h>
31 #include <linux/hugetlb.h>
32 #include <linux/pagemap.h>
33 #include <linux/vm_event_item.h>
34 #include <linux/smp.h>
35 #include <linux/page-flags.h>
36 #include <linux/backing-dev.h>
37 #include <linux/bit_spinlock.h>
38 #include <linux/rcupdate.h>
39 #include <linux/limits.h>
40 #include <linux/export.h>
41 #include <linux/mutex.h>
42 #include <linux/rbtree.h>
43 #include <linux/slab.h>
44 #include <linux/swap.h>
45 #include <linux/swapops.h>
46 #include <linux/spinlock.h>
47 #include <linux/eventfd.h>
48 #include <linux/poll.h>
49 #include <linux/sort.h>
50 #include <linux/fs.h>
51 #include <linux/seq_file.h>
52 #include <linux/vmpressure.h>
53 #include <linux/mm_inline.h>
54 #include <linux/swap_cgroup.h>
55 #include <linux/cpu.h>
56 #include <linux/oom.h>
57 #include <linux/lockdep.h>
58 #include <linux/file.h>
59 #include <linux/tracehook.h>
60 #include <linux/psi.h>
61 #include <linux/seq_buf.h>
62 #include "internal.h"
63 #include <net/sock.h>
64 #include <net/ip.h>
65 #include "slab.h"
66
67 #include <linux/uaccess.h>
68
69 #include <trace/events/vmscan.h>
70
71 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
72 EXPORT_SYMBOL(memory_cgrp_subsys);
73
74 struct mem_cgroup *root_mem_cgroup __read_mostly;
75
76 #define MEM_CGROUP_RECLAIM_RETRIES 5
77
78
79 static bool cgroup_memory_nosocket;
80
81
82 static bool cgroup_memory_nokmem;
83
84
85 #ifdef CONFIG_MEMCG_SWAP
86 int do_swap_account __read_mostly;
87 #else
88 #define do_swap_account 0
89 #endif
90
91 #ifdef CONFIG_CGROUP_WRITEBACK
92 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
93 #endif
94
95
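/*
 * Whether legacy memory+swap accounting is active: only on the
 * non-default (cgroup v1) hierarchy and only when swap accounting
 * has been enabled.
 */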
96 static bool do_memsw_account(void)
97 {
98 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
99 }
100
101 static const char *const mem_cgroup_lru_names[] = {
102 "inactive_anon",
103 "active_anon",
104 "inactive_file",
105 "active_file",
106 "unevictable",
107 };
108
109 #define THRESHOLDS_EVENTS_TARGET 128
110 #define SOFTLIMIT_EVENTS_TARGET 1024
111 #define NUMAINFO_EVENTS_TARGET 1024
112
113
114
115
116
117
118 struct mem_cgroup_tree_per_node {
119 struct rb_root rb_root;
120 struct rb_node *rb_rightmost;
121 spinlock_t lock;
122 };
123
124 struct mem_cgroup_tree {
125 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
126 };
127
128 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
129
130
131 struct mem_cgroup_eventfd_list {
132 struct list_head list;
133 struct eventfd_ctx *eventfd;
134 };
135
136
137
138
139 struct mem_cgroup_event {
140
141
142
143 struct mem_cgroup *memcg;
144
145
146
147 struct eventfd_ctx *eventfd;
148
149
150
151 struct list_head list;
152
153
154
155
156
157 int (*register_event)(struct mem_cgroup *memcg,
158 struct eventfd_ctx *eventfd, const char *args);
159
160
161
162
163
164 void (*unregister_event)(struct mem_cgroup *memcg,
165 struct eventfd_ctx *eventfd);
166
167
168
169
170 poll_table pt;
171 wait_queue_head_t *wqh;
172 wait_queue_entry_t wait;
173 struct work_struct remove;
174 };
175
176 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
177 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
178
179
180
181
182
183 #define MOVE_ANON 0x1U
184 #define MOVE_FILE 0x2U
185 #define MOVE_MASK (MOVE_ANON | MOVE_FILE)
186
187
188 static struct move_charge_struct {
189 spinlock_t lock;
190 struct mm_struct *mm;
191 struct mem_cgroup *from;
192 struct mem_cgroup *to;
193 unsigned long flags;
194 unsigned long precharge;
195 unsigned long moved_charge;
196 unsigned long moved_swap;
197 struct task_struct *moving_task;
198 wait_queue_head_t waitq;
199 } mc = {
200 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
201 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
202 };
203
204
205
206
207
208 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
209 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
210
211 enum charge_type {
212 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
213 MEM_CGROUP_CHARGE_TYPE_ANON,
214 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
215 MEM_CGROUP_CHARGE_TYPE_DROP,
216 NR_CHARGE_TYPE,
217 };
218
219
220 enum res_type {
221 _MEM,
222 _MEMSWAP,
223 _OOM_TYPE,
224 _KMEM,
225 _TCP,
226 };
227
228 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
229 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
230 #define MEMFILE_ATTR(val) ((val) & 0xffff)
231
232 #define OOM_CONTROL (0)
233
234
235
236
237
238
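/*
 * Iterate over the memcg hierarchy below @root (or below root_mem_cgroup
 * for for_each_mem_cgroup) in pre-order via mem_cgroup_iter().  Leaving
 * the loop early requires mem_cgroup_iter_break() to drop the reference
 * held on the current position.
 */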
239 #define for_each_mem_cgroup_tree(iter, root) \
240 for (iter = mem_cgroup_iter(root, NULL, NULL); \
241 iter != NULL; \
242 iter = mem_cgroup_iter(root, iter, NULL))
243
244 #define for_each_mem_cgroup(iter) \
245 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
246 iter != NULL; \
247 iter = mem_cgroup_iter(NULL, iter, NULL))
248
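/*
 * Charging is forced (bypassing the limits) for tasks that are already
 * on their way out: OOM victims, tasks with a fatal signal pending and
 * exiting tasks.
 */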
249 static inline bool should_force_charge(void)
250 {
251 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
252 (current->flags & PF_EXITING);
253 }
254
255
256 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
257 {
258 if (!memcg)
259 memcg = root_mem_cgroup;
260 return &memcg->vmpressure;
261 }
262
263 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
264 {
265 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
266 }
267
268 #ifdef CONFIG_MEMCG_KMEM
269
270
271
272
273
274
275
276
277
278
279
280 static DEFINE_IDA(memcg_cache_ida);
281 int memcg_nr_cache_ids;
282
283
284 static DECLARE_RWSEM(memcg_cache_ids_sem);
285
286 void memcg_get_cache_ids(void)
287 {
288 down_read(&memcg_cache_ids_sem);
289 }
290
291 void memcg_put_cache_ids(void)
292 {
293 up_read(&memcg_cache_ids_sem);
294 }
295
296
297
298
299
300
301
302
303
304
305
306
307
308 #define MEMCG_CACHES_MIN_SIZE 4
309 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
310
311
312
313
314
315
316
317 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
318 EXPORT_SYMBOL(memcg_kmem_enabled_key);
319
320 struct workqueue_struct *memcg_kmem_cache_wq;
321 #endif
322
323 static int memcg_shrinker_map_size;
324 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
325
326 static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
327 {
328 kvfree(container_of(head, struct memcg_shrinker_map, rcu));
329 }
330
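/*
 * Grow each per-node shrinker bitmap of @memcg from @old_size to @size
 * bytes.  The portion covering the old bits is set to all ones so no
 * shrinker is missed, the new tail is cleared, and the old maps are
 * freed via RCU.
 */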
331 static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
332 int size, int old_size)
333 {
334 struct memcg_shrinker_map *new, *old;
335 int nid;
336
337 lockdep_assert_held(&memcg_shrinker_map_mutex);
338
339 for_each_node(nid) {
340 old = rcu_dereference_protected(
341 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
342
343 if (!old)
344 return 0;
345
346 new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
347 if (!new)
348 return -ENOMEM;
349
350
351 memset(new->map, (int)0xff, old_size);
352 memset((void *)new->map + old_size, 0, size - old_size);
353
354 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
355 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
356 }
357
358 return 0;
359 }
360
361 static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
362 {
363 struct mem_cgroup_per_node *pn;
364 struct memcg_shrinker_map *map;
365 int nid;
366
367 if (mem_cgroup_is_root(memcg))
368 return;
369
370 for_each_node(nid) {
371 pn = mem_cgroup_nodeinfo(memcg, nid);
372 map = rcu_dereference_protected(pn->shrinker_map, true);
373 if (map)
374 kvfree(map);
375 rcu_assign_pointer(pn->shrinker_map, NULL);
376 }
377 }
378
379 static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
380 {
381 struct memcg_shrinker_map *map;
382 int nid, size, ret = 0;
383
384 if (mem_cgroup_is_root(memcg))
385 return 0;
386
387 mutex_lock(&memcg_shrinker_map_mutex);
388 size = memcg_shrinker_map_size;
389 for_each_node(nid) {
390 map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
391 if (!map) {
392 memcg_free_shrinker_maps(memcg);
393 ret = -ENOMEM;
394 break;
395 }
396 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
397 }
398 mutex_unlock(&memcg_shrinker_map_mutex);
399
400 return ret;
401 }
402
403 int memcg_expand_shrinker_maps(int new_id)
404 {
405 int size, old_size, ret = 0;
406 struct mem_cgroup *memcg;
407
408 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
409 old_size = memcg_shrinker_map_size;
410 if (size <= old_size)
411 return 0;
412
413 mutex_lock(&memcg_shrinker_map_mutex);
414 if (!root_mem_cgroup)
415 goto unlock;
416
417 for_each_mem_cgroup(memcg) {
418 if (mem_cgroup_is_root(memcg))
419 continue;
420 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
421 if (ret) {
422 mem_cgroup_iter_break(NULL, memcg);
423 goto unlock;
424 }
425 }
426 unlock:
427 if (!ret)
428 memcg_shrinker_map_size = size;
429 mutex_unlock(&memcg_shrinker_map_mutex);
430 return ret;
431 }
432
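/*
 * Flag @shrinker_id as possibly having objects for @memcg on @nid; the
 * barrier orders the bit set after the newly added object is visible.
 */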
433 void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
434 {
435 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
436 struct memcg_shrinker_map *map;
437
438 rcu_read_lock();
439 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
440
441 smp_mb__before_atomic();
442 set_bit(shrinker_id, map->map);
443 rcu_read_unlock();
444 }
445 }
446
447
448
449
450
451
452
453
454
455
456
457
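/*
 * Return the css of the memcg owning @page.  Falls back to the root
 * memcg when the page is not charged or the memory controller is not
 * mounted on the default hierarchy.
 */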
458 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
459 {
460 struct mem_cgroup *memcg;
461
462 memcg = page->mem_cgroup;
463
464 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
465 memcg = root_mem_cgroup;
466
467 return &memcg->css;
468 }
469
470
471
472
473
474
475
476
477
478
479
480
481
482
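/*
 * Look up the cgroup inode number of the memcg charged with @page (the
 * closest online ancestor is used), or 0 if there is none.  For slab
 * pages the memcg is taken from the slab metadata.
 */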
483 ino_t page_cgroup_ino(struct page *page)
484 {
485 struct mem_cgroup *memcg;
486 unsigned long ino = 0;
487
488 rcu_read_lock();
489 if (PageSlab(page) && !PageTail(page))
490 memcg = memcg_from_slab_page(page);
491 else
492 memcg = READ_ONCE(page->mem_cgroup);
493 while (memcg && !(memcg->css.flags & CSS_ONLINE))
494 memcg = parent_mem_cgroup(memcg);
495 if (memcg)
496 ino = cgroup_ino(memcg->css.cgroup);
497 rcu_read_unlock();
498 return ino;
499 }
500
501 static struct mem_cgroup_per_node *
502 mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
503 {
504 int nid = page_to_nid(page);
505
506 return memcg->nodeinfo[nid];
507 }
508
509 static struct mem_cgroup_tree_per_node *
510 soft_limit_tree_node(int nid)
511 {
512 return soft_limit_tree.rb_tree_per_node[nid];
513 }
514
515 static struct mem_cgroup_tree_per_node *
516 soft_limit_tree_from_page(struct page *page)
517 {
518 int nid = page_to_nid(page);
519
520 return soft_limit_tree.rb_tree_per_node[nid];
521 }
522
523 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
524 struct mem_cgroup_tree_per_node *mctz,
525 unsigned long new_usage_in_excess)
526 {
527 struct rb_node **p = &mctz->rb_root.rb_node;
528 struct rb_node *parent = NULL;
529 struct mem_cgroup_per_node *mz_node;
530 bool rightmost = true;
531
532 if (mz->on_tree)
533 return;
534
535 mz->usage_in_excess = new_usage_in_excess;
536 if (!mz->usage_in_excess)
537 return;
538 while (*p) {
539 parent = *p;
540 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
541 tree_node);
542 if (mz->usage_in_excess < mz_node->usage_in_excess) {
543 p = &(*p)->rb_left;
544 rightmost = false;
545 }
546
547
548
549
550
551 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
552 p = &(*p)->rb_right;
553 }
554
555 if (rightmost)
556 mctz->rb_rightmost = &mz->tree_node;
557
558 rb_link_node(&mz->tree_node, parent, p);
559 rb_insert_color(&mz->tree_node, &mctz->rb_root);
560 mz->on_tree = true;
561 }
562
563 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
564 struct mem_cgroup_tree_per_node *mctz)
565 {
566 if (!mz->on_tree)
567 return;
568
569 if (&mz->tree_node == mctz->rb_rightmost)
570 mctz->rb_rightmost = rb_prev(&mz->tree_node);
571
572 rb_erase(&mz->tree_node, &mctz->rb_root);
573 mz->on_tree = false;
574 }
575
576 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
577 struct mem_cgroup_tree_per_node *mctz)
578 {
579 unsigned long flags;
580
581 spin_lock_irqsave(&mctz->lock, flags);
582 __mem_cgroup_remove_exceeded(mz, mctz);
583 spin_unlock_irqrestore(&mctz->lock, flags);
584 }
585
586 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
587 {
588 unsigned long nr_pages = page_counter_read(&memcg->memory);
589 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
590 unsigned long excess = 0;
591
592 if (nr_pages > soft_limit)
593 excess = nr_pages - soft_limit;
594
595 return excess;
596 }
597
598 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
599 {
600 unsigned long excess;
601 struct mem_cgroup_per_node *mz;
602 struct mem_cgroup_tree_per_node *mctz;
603
604 mctz = soft_limit_tree_from_page(page);
605 if (!mctz)
606 return;
607
608
609
610
611 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
612 mz = mem_cgroup_page_nodeinfo(memcg, page);
613 excess = soft_limit_excess(memcg);
614
615
616
617
618 if (excess || mz->on_tree) {
619 unsigned long flags;
620
621 spin_lock_irqsave(&mctz->lock, flags);
622
623 if (mz->on_tree)
624 __mem_cgroup_remove_exceeded(mz, mctz);
625
626
627
628
629 __mem_cgroup_insert_exceeded(mz, mctz, excess);
630 spin_unlock_irqrestore(&mctz->lock, flags);
631 }
632 }
633 }
634
635 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
636 {
637 struct mem_cgroup_tree_per_node *mctz;
638 struct mem_cgroup_per_node *mz;
639 int nid;
640
641 for_each_node(nid) {
642 mz = mem_cgroup_nodeinfo(memcg, nid);
643 mctz = soft_limit_tree_node(nid);
644 if (mctz)
645 mem_cgroup_remove_exceeded(mz, mctz);
646 }
647 }
648
649 static struct mem_cgroup_per_node *
650 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
651 {
652 struct mem_cgroup_per_node *mz;
653
654 retry:
655 mz = NULL;
656 if (!mctz->rb_rightmost)
657 goto done;
658
659 mz = rb_entry(mctz->rb_rightmost,
660 struct mem_cgroup_per_node, tree_node);
661
662
663
664
665
666 __mem_cgroup_remove_exceeded(mz, mctz);
667 if (!soft_limit_excess(mz->memcg) ||
668 !css_tryget_online(&mz->memcg->css))
669 goto retry;
670 done:
671 return mz;
672 }
673
674 static struct mem_cgroup_per_node *
675 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
676 {
677 struct mem_cgroup_per_node *mz;
678
679 spin_lock_irq(&mctz->lock);
680 mz = __mem_cgroup_largest_soft_limit_node(mctz);
681 spin_unlock_irq(&mctz->lock);
682 return mz;
683 }
684
685
686
687
688
689
690
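/*
 * __mod_memcg_state - update a memcg statistics counter.  Deltas are
 * accumulated per cpu and only folded into the local and hierarchical
 * atomic counters once they exceed MEMCG_CHARGE_BATCH.
 */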
691 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
692 {
693 long x;
694
695 if (mem_cgroup_disabled())
696 return;
697
698 x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
699 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
700 struct mem_cgroup *mi;
701
702
703
704
705
706 __this_cpu_add(memcg->vmstats_local->stat[idx], x);
707 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
708 atomic_long_add(x, &mi->vmstats[idx]);
709 x = 0;
710 }
711 __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
712 }
713
714 static struct mem_cgroup_per_node *
715 parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
716 {
717 struct mem_cgroup *parent;
718
719 parent = parent_mem_cgroup(pn->memcg);
720 if (!parent)
721 return NULL;
722 return mem_cgroup_nodeinfo(parent, nid);
723 }
724
725
726
727
728
729
730
731
732
733
734
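/*
 * __mod_lruvec_state - update a per-lruvec statistic.  Updates the
 * node-wide counter, the owning memcg counter and the per-node lruvec
 * counters, using the same per-cpu batching as __mod_memcg_state().
 */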
735 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
736 int val)
737 {
738 pg_data_t *pgdat = lruvec_pgdat(lruvec);
739 struct mem_cgroup_per_node *pn;
740 struct mem_cgroup *memcg;
741 long x;
742
743
744 __mod_node_page_state(pgdat, idx, val);
745
746 if (mem_cgroup_disabled())
747 return;
748
749 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
750 memcg = pn->memcg;
751
752
753 __mod_memcg_state(memcg, idx, val);
754
755
756 __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
757
758 x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
759 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
760 struct mem_cgroup_per_node *pi;
761
762 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
763 atomic_long_add(x, &pi->lruvec_stat[idx]);
764 x = 0;
765 }
766 __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
767 }
768
769 void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
770 {
771 struct page *page = virt_to_head_page(p);
772 pg_data_t *pgdat = page_pgdat(page);
773 struct mem_cgroup *memcg;
774 struct lruvec *lruvec;
775
776 rcu_read_lock();
777 memcg = memcg_from_slab_page(page);
778
779
780 if (!memcg || memcg == root_mem_cgroup) {
781 __mod_node_page_state(pgdat, idx, val);
782 } else {
783 lruvec = mem_cgroup_lruvec(pgdat, memcg);
784 __mod_lruvec_state(lruvec, idx, val);
785 }
786 rcu_read_unlock();
787 }
788
789 void mod_memcg_obj_state(void *p, int idx, int val)
790 {
791 struct mem_cgroup *memcg;
792
793 rcu_read_lock();
794 memcg = mem_cgroup_from_obj(p);
795 if (memcg)
796 mod_memcg_state(memcg, idx, val);
797 rcu_read_unlock();
798 }
799
800
801
802
803
804
805
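/*
 * __count_memcg_events - account @count occurrences of event @idx to
 * @memcg, batched per cpu and propagated up the hierarchy once the
 * batch threshold is crossed.
 */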
806 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
807 unsigned long count)
808 {
809 unsigned long x;
810
811 if (mem_cgroup_disabled())
812 return;
813
814 x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
815 if (unlikely(x > MEMCG_CHARGE_BATCH)) {
816 struct mem_cgroup *mi;
817
818
819
820
821
822 __this_cpu_add(memcg->vmstats_local->events[idx], x);
823 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
824 atomic_long_add(x, &mi->vmevents[idx]);
825 x = 0;
826 }
827 __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
828 }
829
830 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
831 {
832 return atomic_long_read(&memcg->vmevents[event]);
833 }
834
835 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
836 {
837 long x = 0;
838 int cpu;
839
840 for_each_possible_cpu(cpu)
841 x += per_cpu(memcg->vmstats_local->events[event], cpu);
842 return x;
843 }
844
845 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
846 struct page *page,
847 bool compound, int nr_pages)
848 {
849
850
851
852
853 if (PageAnon(page))
854 __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
855 else {
856 __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
857 if (PageSwapBacked(page))
858 __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
859 }
860
861 if (compound) {
862 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
863 __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
864 }
865
866
867 if (nr_pages > 0)
868 __count_memcg_events(memcg, PGPGIN, 1);
869 else {
870 __count_memcg_events(memcg, PGPGOUT, 1);
871 nr_pages = -nr_pages;
872 }
873
874 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
875 }
876
877 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
878 enum mem_cgroup_events_target target)
879 {
880 unsigned long val, next;
881
882 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
883 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
884
885 if ((long)(next - val) < 0) {
886 switch (target) {
887 case MEM_CGROUP_TARGET_THRESH:
888 next = val + THRESHOLDS_EVENTS_TARGET;
889 break;
890 case MEM_CGROUP_TARGET_SOFTLIMIT:
891 next = val + SOFTLIMIT_EVENTS_TARGET;
892 break;
893 case MEM_CGROUP_TARGET_NUMAINFO:
894 next = val + NUMAINFO_EVENTS_TARGET;
895 break;
896 default:
897 break;
898 }
899 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
900 return true;
901 }
902 return false;
903 }
904
905
906
907
908
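/*
 * Check whether enough page events have accumulated since the last time
 * to re-evaluate the thresholds, the soft-limit tree and, on NUMA, the
 * reclaimable-node information.
 */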
909 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
910 {
911
912 if (unlikely(mem_cgroup_event_ratelimit(memcg,
913 MEM_CGROUP_TARGET_THRESH))) {
914 bool do_softlimit;
915 bool do_numainfo __maybe_unused;
916
917 do_softlimit = mem_cgroup_event_ratelimit(memcg,
918 MEM_CGROUP_TARGET_SOFTLIMIT);
919 #if MAX_NUMNODES > 1
920 do_numainfo = mem_cgroup_event_ratelimit(memcg,
921 MEM_CGROUP_TARGET_NUMAINFO);
922 #endif
923 mem_cgroup_threshold(memcg);
924 if (unlikely(do_softlimit))
925 mem_cgroup_update_tree(memcg, page);
926 #if MAX_NUMNODES > 1
927 if (unlikely(do_numainfo))
928 atomic_inc(&memcg->numainfo_events);
929 #endif
930 }
931 }
932
933 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
934 {
935
936
937
938
939
940 if (unlikely(!p))
941 return NULL;
942
943 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
944 }
945 EXPORT_SYMBOL(mem_cgroup_from_task);
946
947
948
949
950
951
952
953
954
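/*
 * get_mem_cgroup_from_mm - obtain a reference on the memcg of @mm's
 * owner.  Falls back to the root memcg when @mm is NULL or the owner
 * has no memcg; always returns with a css reference held.
 */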
955 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
956 {
957 struct mem_cgroup *memcg;
958
959 if (mem_cgroup_disabled())
960 return NULL;
961
962 rcu_read_lock();
963 do {
964
965
966
967
968
969 if (unlikely(!mm))
970 memcg = root_mem_cgroup;
971 else {
972 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
973 if (unlikely(!memcg))
974 memcg = root_mem_cgroup;
975 }
976 } while (!css_tryget(&memcg->css));
977 rcu_read_unlock();
978 return memcg;
979 }
980 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
981
982
983
984
985
986
987
988
989 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
990 {
991 struct mem_cgroup *memcg = page->mem_cgroup;
992
993 if (mem_cgroup_disabled())
994 return NULL;
995
996 rcu_read_lock();
997 if (!memcg || !css_tryget_online(&memcg->css))
998 memcg = root_mem_cgroup;
999 rcu_read_unlock();
1000 return memcg;
1001 }
1002 EXPORT_SYMBOL(get_mem_cgroup_from_page);
1003
1004
1005
1006
1007 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
1008 {
1009 if (unlikely(current->active_memcg)) {
1010 struct mem_cgroup *memcg = root_mem_cgroup;
1011
1012 rcu_read_lock();
1013 if (css_tryget_online(&current->active_memcg->css))
1014 memcg = current->active_memcg;
1015 rcu_read_unlock();
1016 return memcg;
1017 }
1018 return get_mem_cgroup_from_mm(current->mm);
1019 }
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
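/*
 * mem_cgroup_iter - pre-order walk of the hierarchy below @root.
 * @prev is the memcg returned by the previous call (NULL to start);
 * @reclaim carries shared per-node, per-priority iteration state so
 * that concurrent reclaimers divide the hierarchy between them.
 * Callers that break out of the loop early must use
 * mem_cgroup_iter_break() to drop the reference on the last memcg.
 */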
1038 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1039 struct mem_cgroup *prev,
1040 struct mem_cgroup_reclaim_cookie *reclaim)
1041 {
1042 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1043 struct cgroup_subsys_state *css = NULL;
1044 struct mem_cgroup *memcg = NULL;
1045 struct mem_cgroup *pos = NULL;
1046
1047 if (mem_cgroup_disabled())
1048 return NULL;
1049
1050 if (!root)
1051 root = root_mem_cgroup;
1052
1053 if (prev && !reclaim)
1054 pos = prev;
1055
1056 if (!root->use_hierarchy && root != root_mem_cgroup) {
1057 if (prev)
1058 goto out;
1059 return root;
1060 }
1061
1062 rcu_read_lock();
1063
1064 if (reclaim) {
1065 struct mem_cgroup_per_node *mz;
1066
1067 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1068 iter = &mz->iter[reclaim->priority];
1069
1070 if (prev && reclaim->generation != iter->generation)
1071 goto out_unlock;
1072
1073 while (1) {
1074 pos = READ_ONCE(iter->position);
1075 if (!pos || css_tryget(&pos->css))
1076 break;
1077
1078
1079
1080
1081
1082
1083
1084
1085 (void)cmpxchg(&iter->position, pos, NULL);
1086 }
1087 }
1088
1089 if (pos)
1090 css = &pos->css;
1091
1092 for (;;) {
1093 css = css_next_descendant_pre(css, &root->css);
1094 if (!css) {
1095
1096
1097
1098
1099
1100
1101 if (!prev)
1102 continue;
1103 break;
1104 }
1105
1106
1107
1108
1109
1110
1111 memcg = mem_cgroup_from_css(css);
1112
1113 if (css == &root->css)
1114 break;
1115
1116 if (css_tryget(css))
1117 break;
1118
1119 memcg = NULL;
1120 }
1121
1122 if (reclaim) {
1123
1124
1125
1126
1127
1128 (void)cmpxchg(&iter->position, pos, memcg);
1129
1130 if (pos)
1131 css_put(&pos->css);
1132
1133 if (!memcg)
1134 iter->generation++;
1135 else if (!prev)
1136 reclaim->generation = iter->generation;
1137 }
1138
1139 out_unlock:
1140 rcu_read_unlock();
1141 out:
1142 if (prev && prev != root)
1143 css_put(&prev->css);
1144
1145 return memcg;
1146 }
1147
1148
1149
1150
1151
1152
1153 void mem_cgroup_iter_break(struct mem_cgroup *root,
1154 struct mem_cgroup *prev)
1155 {
1156 if (!root)
1157 root = root_mem_cgroup;
1158 if (prev && prev != root)
1159 css_put(&prev->css);
1160 }
1161
1162 static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1163 struct mem_cgroup *dead_memcg)
1164 {
1165 struct mem_cgroup_reclaim_iter *iter;
1166 struct mem_cgroup_per_node *mz;
1167 int nid;
1168 int i;
1169
1170 for_each_node(nid) {
1171 mz = mem_cgroup_nodeinfo(from, nid);
1172 for (i = 0; i <= DEF_PRIORITY; i++) {
1173 iter = &mz->iter[i];
1174 cmpxchg(&iter->position,
1175 dead_memcg, NULL);
1176 }
1177 }
1178 }
1179
1180 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1181 {
1182 struct mem_cgroup *memcg = dead_memcg;
1183 struct mem_cgroup *last;
1184
1185 do {
1186 __invalidate_reclaim_iterators(memcg, dead_memcg);
1187 last = memcg;
1188 } while ((memcg = parent_mem_cgroup(memcg)));
1189
1190
1191
1192
1193
1194
1195
1196 if (last != root_mem_cgroup)
1197 __invalidate_reclaim_iterators(root_mem_cgroup,
1198 dead_memcg);
1199 }
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
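/*
 * mem_cgroup_scan_tasks - invoke @fn on each task in @memcg's hierarchy
 * until it returns a non-zero value, which is then passed back to the
 * caller.  Must not be called on the root memcg.
 */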
1214 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1215 int (*fn)(struct task_struct *, void *), void *arg)
1216 {
1217 struct mem_cgroup *iter;
1218 int ret = 0;
1219
1220 BUG_ON(memcg == root_mem_cgroup);
1221
1222 for_each_mem_cgroup_tree(iter, memcg) {
1223 struct css_task_iter it;
1224 struct task_struct *task;
1225
1226 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1227 while (!ret && (task = css_task_iter_next(&it)))
1228 ret = fn(task, arg);
1229 css_task_iter_end(&it);
1230 if (ret) {
1231 mem_cgroup_iter_break(memcg, iter);
1232 break;
1233 }
1234 }
1235 return ret;
1236 }
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
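/*
 * mem_cgroup_page_lruvec - return the lruvec @page belongs to on
 * @pgdat.  Uses the node's lruvec when memcgs are disabled and the
 * root memcg for uncharged pages; also re-synchronizes a stale
 * lruvec->pgdat pointer.
 */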
1247 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1248 {
1249 struct mem_cgroup_per_node *mz;
1250 struct mem_cgroup *memcg;
1251 struct lruvec *lruvec;
1252
1253 if (mem_cgroup_disabled()) {
1254 lruvec = &pgdat->lruvec;
1255 goto out;
1256 }
1257
1258 memcg = page->mem_cgroup;
1259
1260
1261
1262
1263 if (!memcg)
1264 memcg = root_mem_cgroup;
1265
1266 mz = mem_cgroup_page_nodeinfo(memcg, page);
1267 lruvec = &mz->lruvec;
1268 out:
1269
1270
1271
1272
1273
1274 if (unlikely(lruvec->pgdat != pgdat))
1275 lruvec->pgdat = pgdat;
1276 return lruvec;
1277 }
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
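/*
 * mem_cgroup_update_lru_size - adjust the per-zone LRU size of @lruvec
 * by @nr_pages (may be negative).  Decrements are applied before the
 * underflow check and increments after it, so the warning catches
 * accounting imbalances.
 */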
1290 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1291 int zid, int nr_pages)
1292 {
1293 struct mem_cgroup_per_node *mz;
1294 unsigned long *lru_size;
1295 long size;
1296
1297 if (mem_cgroup_disabled())
1298 return;
1299
1300 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1301 lru_size = &mz->lru_zone_size[zid][lru];
1302
1303 if (nr_pages < 0)
1304 *lru_size += nr_pages;
1305
1306 size = *lru_size;
1307 if (WARN_ONCE(size < 0,
1308 "%s(%p, %d, %d): lru_size %ld\n",
1309 __func__, lruvec, lru, nr_pages, size)) {
1310 VM_BUG_ON(1);
1311 *lru_size = 0;
1312 }
1313
1314 if (nr_pages > 0)
1315 *lru_size += nr_pages;
1316 }
1317
1318
1319
1320
1321
1322
1323
1324
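/*
 * mem_cgroup_margin - number of pages that can still be charged to
 * @memcg before it hits its hard limit, taking the memsw counter into
 * account when memory+swap accounting is active.
 */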
1325 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1326 {
1327 unsigned long margin = 0;
1328 unsigned long count;
1329 unsigned long limit;
1330
1331 count = page_counter_read(&memcg->memory);
1332 limit = READ_ONCE(memcg->memory.max);
1333 if (count < limit)
1334 margin = limit - count;
1335
1336 if (do_memsw_account()) {
1337 count = page_counter_read(&memcg->memsw);
1338 limit = READ_ONCE(memcg->memsw.max);
1339 if (count <= limit)
1340 margin = min(margin, limit - count);
1341 else
1342 margin = 0;
1343 }
1344
1345 return margin;
1346 }
1347
1348
1349
1350
1351
1352
1353
1354
1355 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1356 {
1357 struct mem_cgroup *from;
1358 struct mem_cgroup *to;
1359 bool ret = false;
1360
1361
1362
1363
1364 spin_lock(&mc.lock);
1365 from = mc.from;
1366 to = mc.to;
1367 if (!from)
1368 goto unlock;
1369
1370 ret = mem_cgroup_is_descendant(from, memcg) ||
1371 mem_cgroup_is_descendant(to, memcg);
1372 unlock:
1373 spin_unlock(&mc.lock);
1374 return ret;
1375 }
1376
1377 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1378 {
1379 if (mc.moving_task && current != mc.moving_task) {
1380 if (mem_cgroup_under_move(memcg)) {
1381 DEFINE_WAIT(wait);
1382 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1383
1384 if (mc.moving_task)
1385 schedule();
1386 finish_wait(&mc.waitq, &wait);
1387 return true;
1388 }
1389 }
1390 return false;
1391 }
1392
1393 static char *memory_stat_format(struct mem_cgroup *memcg)
1394 {
1395 struct seq_buf s;
1396 int i;
1397
1398 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1399 if (!s.buffer)
1400 return NULL;
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413 seq_buf_printf(&s, "anon %llu\n",
1414 (u64)memcg_page_state(memcg, MEMCG_RSS) *
1415 PAGE_SIZE);
1416 seq_buf_printf(&s, "file %llu\n",
1417 (u64)memcg_page_state(memcg, MEMCG_CACHE) *
1418 PAGE_SIZE);
1419 seq_buf_printf(&s, "kernel_stack %llu\n",
1420 (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1421 1024);
1422 seq_buf_printf(&s, "slab %llu\n",
1423 (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
1424 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
1425 PAGE_SIZE);
1426 seq_buf_printf(&s, "sock %llu\n",
1427 (u64)memcg_page_state(memcg, MEMCG_SOCK) *
1428 PAGE_SIZE);
1429
1430 seq_buf_printf(&s, "shmem %llu\n",
1431 (u64)memcg_page_state(memcg, NR_SHMEM) *
1432 PAGE_SIZE);
1433 seq_buf_printf(&s, "file_mapped %llu\n",
1434 (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1435 PAGE_SIZE);
1436 seq_buf_printf(&s, "file_dirty %llu\n",
1437 (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1438 PAGE_SIZE);
1439 seq_buf_printf(&s, "file_writeback %llu\n",
1440 (u64)memcg_page_state(memcg, NR_WRITEBACK) *
1441 PAGE_SIZE);
1442
1443
1444
1445
1446
1447
1448
1449 seq_buf_printf(&s, "anon_thp %llu\n",
1450 (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
1451 PAGE_SIZE);
1452
1453 for (i = 0; i < NR_LRU_LISTS; i++)
1454 seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
1455 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1456 PAGE_SIZE);
1457
1458 seq_buf_printf(&s, "slab_reclaimable %llu\n",
1459 (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
1460 PAGE_SIZE);
1461 seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1462 (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
1463 PAGE_SIZE);
1464
1465
1466
1467 seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
1468 seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
1469
1470 seq_buf_printf(&s, "workingset_refault %lu\n",
1471 memcg_page_state(memcg, WORKINGSET_REFAULT));
1472 seq_buf_printf(&s, "workingset_activate %lu\n",
1473 memcg_page_state(memcg, WORKINGSET_ACTIVATE));
1474 seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1475 memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1476
1477 seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
1478 seq_buf_printf(&s, "pgscan %lu\n",
1479 memcg_events(memcg, PGSCAN_KSWAPD) +
1480 memcg_events(memcg, PGSCAN_DIRECT));
1481 seq_buf_printf(&s, "pgsteal %lu\n",
1482 memcg_events(memcg, PGSTEAL_KSWAPD) +
1483 memcg_events(memcg, PGSTEAL_DIRECT));
1484 seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
1485 seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
1486 seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
1487 seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
1488
1489 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1490 seq_buf_printf(&s, "thp_fault_alloc %lu\n",
1491 memcg_events(memcg, THP_FAULT_ALLOC));
1492 seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
1493 memcg_events(memcg, THP_COLLAPSE_ALLOC));
1494 #endif
1495
1496
1497 WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1498
1499 return s.buffer;
1500 }
1501
1502 #define K(x) ((x) << (PAGE_SHIFT-10))
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1513 {
1514 rcu_read_lock();
1515
1516 if (memcg) {
1517 pr_cont(",oom_memcg=");
1518 pr_cont_cgroup_path(memcg->css.cgroup);
1519 } else
1520 pr_cont(",global_oom");
1521 if (p) {
1522 pr_cont(",task_memcg=");
1523 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1524 }
1525 rcu_read_unlock();
1526 }
1527
1528
1529
1530
1531
1532
1533 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1534 {
1535 char *buf;
1536
1537 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1538 K((u64)page_counter_read(&memcg->memory)),
1539 K((u64)memcg->memory.max), memcg->memory.failcnt);
1540 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1541 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1542 K((u64)page_counter_read(&memcg->swap)),
1543 K((u64)memcg->swap.max), memcg->swap.failcnt);
1544 else {
1545 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1546 K((u64)page_counter_read(&memcg->memsw)),
1547 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1548 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1549 K((u64)page_counter_read(&memcg->kmem)),
1550 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1551 }
1552
1553 pr_info("Memory cgroup stats for ");
1554 pr_cont_cgroup_path(memcg->css.cgroup);
1555 pr_cont(":");
1556 buf = memory_stat_format(memcg);
1557 if (!buf)
1558 return;
1559 pr_info("%s", buf);
1560 kfree(buf);
1561 }
1562
1563
1564
1565
1566 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1567 {
1568 unsigned long max;
1569
1570 max = memcg->memory.max;
1571 if (mem_cgroup_swappiness(memcg)) {
1572 unsigned long memsw_max;
1573 unsigned long swap_max;
1574
1575 memsw_max = memcg->memsw.max;
1576 swap_max = memcg->swap.max;
1577 swap_max = min(swap_max, (unsigned long)total_swap_pages);
1578 max = min(max + swap_max, memsw_max);
1579 }
1580 return max;
1581 }
1582
1583 unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1584 {
1585 return page_counter_read(&memcg->memory);
1586 }
1587
1588 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1589 int order)
1590 {
1591 struct oom_control oc = {
1592 .zonelist = NULL,
1593 .nodemask = NULL,
1594 .memcg = memcg,
1595 .gfp_mask = gfp_mask,
1596 .order = order,
1597 };
1598 bool ret;
1599
1600 if (mutex_lock_killable(&oom_lock))
1601 return true;
1602
1603
1604
1605
1606 ret = should_force_charge() || out_of_memory(&oc);
1607 mutex_unlock(&oom_lock);
1608 return ret;
1609 }
1610
1611 #if MAX_NUMNODES > 1
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1624 int nid, bool noswap)
1625 {
1626 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
1627
1628 if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
1629 lruvec_page_state(lruvec, NR_ACTIVE_FILE))
1630 return true;
1631 if (noswap || !total_swap_pages)
1632 return false;
1633 if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
1634 lruvec_page_state(lruvec, NR_ACTIVE_ANON))
1635 return true;
1636 return false;
1637
1638 }
1639
1640
1641
1642
1643
1644
1645
1646 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1647 {
1648 int nid;
1649
1650
1651
1652
1653 if (!atomic_read(&memcg->numainfo_events))
1654 return;
1655 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1656 return;
1657
1658
1659 memcg->scan_nodes = node_states[N_MEMORY];
1660
1661 for_each_node_mask(nid, node_states[N_MEMORY]) {
1662
1663 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1664 node_clear(nid, memcg->scan_nodes);
1665 }
1666
1667 atomic_set(&memcg->numainfo_events, 0);
1668 atomic_set(&memcg->numainfo_updating, 0);
1669 }
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1684 {
1685 int node;
1686
1687 mem_cgroup_may_update_nodemask(memcg);
1688 node = memcg->last_scanned_node;
1689
1690 node = next_node_in(node, memcg->scan_nodes);
1691
1692
1693
1694
1695
1696 if (unlikely(node == MAX_NUMNODES))
1697 node = numa_node_id();
1698
1699 memcg->last_scanned_node = node;
1700 return node;
1701 }
1702 #else
1703 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1704 {
1705 return 0;
1706 }
1707 #endif
1708
1709 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1710 pg_data_t *pgdat,
1711 gfp_t gfp_mask,
1712 unsigned long *total_scanned)
1713 {
1714 struct mem_cgroup *victim = NULL;
1715 int total = 0;
1716 int loop = 0;
1717 unsigned long excess;
1718 unsigned long nr_scanned;
1719 struct mem_cgroup_reclaim_cookie reclaim = {
1720 .pgdat = pgdat,
1721 .priority = 0,
1722 };
1723
1724 excess = soft_limit_excess(root_memcg);
1725
1726 while (1) {
1727 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1728 if (!victim) {
1729 loop++;
1730 if (loop >= 2) {
1731
1732
1733
1734
1735
1736 if (!total)
1737 break;
1738
1739
1740
1741
1742
1743
1744 if (total >= (excess >> 2) ||
1745 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1746 break;
1747 }
1748 continue;
1749 }
1750 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1751 pgdat, &nr_scanned);
1752 *total_scanned += nr_scanned;
1753 if (!soft_limit_excess(root_memcg))
1754 break;
1755 }
1756 mem_cgroup_iter_break(root_memcg, victim);
1757 return total;
1758 }
1759
1760 #ifdef CONFIG_LOCKDEP
1761 static struct lockdep_map memcg_oom_lock_dep_map = {
1762 .name = "memcg_oom_lock",
1763 };
1764 #endif
1765
1766 static DEFINE_SPINLOCK(memcg_oom_lock);
1767
1768
1769
1770
1771
1772 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1773 {
1774 struct mem_cgroup *iter, *failed = NULL;
1775
1776 spin_lock(&memcg_oom_lock);
1777
1778 for_each_mem_cgroup_tree(iter, memcg) {
1779 if (iter->oom_lock) {
1780
1781
1782
1783
1784 failed = iter;
1785 mem_cgroup_iter_break(memcg, iter);
1786 break;
1787 } else
1788 iter->oom_lock = true;
1789 }
1790
1791 if (failed) {
1792
1793
1794
1795
1796 for_each_mem_cgroup_tree(iter, memcg) {
1797 if (iter == failed) {
1798 mem_cgroup_iter_break(memcg, iter);
1799 break;
1800 }
1801 iter->oom_lock = false;
1802 }
1803 } else
1804 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1805
1806 spin_unlock(&memcg_oom_lock);
1807
1808 return !failed;
1809 }
1810
1811 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1812 {
1813 struct mem_cgroup *iter;
1814
1815 spin_lock(&memcg_oom_lock);
1816 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1817 for_each_mem_cgroup_tree(iter, memcg)
1818 iter->oom_lock = false;
1819 spin_unlock(&memcg_oom_lock);
1820 }
1821
1822 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1823 {
1824 struct mem_cgroup *iter;
1825
1826 spin_lock(&memcg_oom_lock);
1827 for_each_mem_cgroup_tree(iter, memcg)
1828 iter->under_oom++;
1829 spin_unlock(&memcg_oom_lock);
1830 }
1831
1832 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1833 {
1834 struct mem_cgroup *iter;
1835
1836
1837
1838
1839
1840 spin_lock(&memcg_oom_lock);
1841 for_each_mem_cgroup_tree(iter, memcg)
1842 if (iter->under_oom > 0)
1843 iter->under_oom--;
1844 spin_unlock(&memcg_oom_lock);
1845 }
1846
1847 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1848
1849 struct oom_wait_info {
1850 struct mem_cgroup *memcg;
1851 wait_queue_entry_t wait;
1852 };
1853
1854 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1855 unsigned mode, int sync, void *arg)
1856 {
1857 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1858 struct mem_cgroup *oom_wait_memcg;
1859 struct oom_wait_info *oom_wait_info;
1860
1861 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1862 oom_wait_memcg = oom_wait_info->memcg;
1863
1864 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1865 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1866 return 0;
1867 return autoremove_wake_function(wait, mode, sync, arg);
1868 }
1869
1870 static void memcg_oom_recover(struct mem_cgroup *memcg)
1871 {
1872
1873
1874
1875
1876
1877
1878
1879
1880 if (memcg && memcg->under_oom)
1881 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1882 }
1883
1884 enum oom_status {
1885 OOM_SUCCESS,
1886 OOM_FAILED,
1887 OOM_ASYNC,
1888 OOM_SKIPPED
1889 };
1890
1891 static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1892 {
1893 enum oom_status ret;
1894 bool locked;
1895
1896 if (order > PAGE_ALLOC_COSTLY_ORDER)
1897 return OOM_SKIPPED;
1898
1899 memcg_memory_event(memcg, MEMCG_OOM);
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919 if (memcg->oom_kill_disable) {
1920 if (!current->in_user_fault)
1921 return OOM_SKIPPED;
1922 css_get(&memcg->css);
1923 current->memcg_in_oom = memcg;
1924 current->memcg_oom_gfp_mask = mask;
1925 current->memcg_oom_order = order;
1926
1927 return OOM_ASYNC;
1928 }
1929
1930 mem_cgroup_mark_under_oom(memcg);
1931
1932 locked = mem_cgroup_oom_trylock(memcg);
1933
1934 if (locked)
1935 mem_cgroup_oom_notify(memcg);
1936
1937 mem_cgroup_unmark_under_oom(memcg);
1938 if (mem_cgroup_out_of_memory(memcg, mask, order))
1939 ret = OOM_SUCCESS;
1940 else
1941 ret = OOM_FAILED;
1942
1943 if (locked)
1944 mem_cgroup_oom_unlock(memcg);
1945
1946 return ret;
1947 }
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966 bool mem_cgroup_oom_synchronize(bool handle)
1967 {
1968 struct mem_cgroup *memcg = current->memcg_in_oom;
1969 struct oom_wait_info owait;
1970 bool locked;
1971
1972
1973 if (!memcg)
1974 return false;
1975
1976 if (!handle)
1977 goto cleanup;
1978
1979 owait.memcg = memcg;
1980 owait.wait.flags = 0;
1981 owait.wait.func = memcg_oom_wake_function;
1982 owait.wait.private = current;
1983 INIT_LIST_HEAD(&owait.wait.entry);
1984
1985 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1986 mem_cgroup_mark_under_oom(memcg);
1987
1988 locked = mem_cgroup_oom_trylock(memcg);
1989
1990 if (locked)
1991 mem_cgroup_oom_notify(memcg);
1992
1993 if (locked && !memcg->oom_kill_disable) {
1994 mem_cgroup_unmark_under_oom(memcg);
1995 finish_wait(&memcg_oom_waitq, &owait.wait);
1996 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1997 current->memcg_oom_order);
1998 } else {
1999 schedule();
2000 mem_cgroup_unmark_under_oom(memcg);
2001 finish_wait(&memcg_oom_waitq, &owait.wait);
2002 }
2003
2004 if (locked) {
2005 mem_cgroup_oom_unlock(memcg);
2006
2007
2008
2009
2010
2011 memcg_oom_recover(memcg);
2012 }
2013 cleanup:
2014 current->memcg_in_oom = NULL;
2015 css_put(&memcg->css);
2016 return true;
2017 }
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
2030 struct mem_cgroup *oom_domain)
2031 {
2032 struct mem_cgroup *oom_group = NULL;
2033 struct mem_cgroup *memcg;
2034
2035 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2036 return NULL;
2037
2038 if (!oom_domain)
2039 oom_domain = root_mem_cgroup;
2040
2041 rcu_read_lock();
2042
2043 memcg = mem_cgroup_from_task(victim);
2044 if (memcg == root_mem_cgroup)
2045 goto out;
2046
2047
2048
2049
2050
2051
2052 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2053 if (memcg->oom_group)
2054 oom_group = memcg;
2055
2056 if (memcg == oom_domain)
2057 break;
2058 }
2059
2060 if (oom_group)
2061 css_get(&oom_group->css);
2062 out:
2063 rcu_read_unlock();
2064
2065 return oom_group;
2066 }
2067
2068 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2069 {
2070 pr_info("Tasks in ");
2071 pr_cont_cgroup_path(memcg->css.cgroup);
2072 pr_cont(" are going to be killed due to memory.oom.group set\n");
2073 }
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
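/*
 * lock_page_memcg - pin the memcg binding of @page for statistics
 * updates.  Takes rcu_read_lock() and, while charge moving is in
 * progress, the memcg's move_lock; must be paired with
 * unlock_page_memcg().
 */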
2086 struct mem_cgroup *lock_page_memcg(struct page *page)
2087 {
2088 struct mem_cgroup *memcg;
2089 unsigned long flags;
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102 rcu_read_lock();
2103
2104 if (mem_cgroup_disabled())
2105 return NULL;
2106 again:
2107 memcg = page->mem_cgroup;
2108 if (unlikely(!memcg))
2109 return NULL;
2110
2111 if (atomic_read(&memcg->moving_account) <= 0)
2112 return memcg;
2113
2114 spin_lock_irqsave(&memcg->move_lock, flags);
2115 if (memcg != page->mem_cgroup) {
2116 spin_unlock_irqrestore(&memcg->move_lock, flags);
2117 goto again;
2118 }
2119
2120
2121
2122
2123
2124
2125 memcg->move_lock_task = current;
2126 memcg->move_lock_flags = flags;
2127
2128 return memcg;
2129 }
2130 EXPORT_SYMBOL(lock_page_memcg);
2131
2132
2133
2134
2135
2136
2137
2138 void __unlock_page_memcg(struct mem_cgroup *memcg)
2139 {
2140 if (memcg && memcg->move_lock_task == current) {
2141 unsigned long flags = memcg->move_lock_flags;
2142
2143 memcg->move_lock_task = NULL;
2144 memcg->move_lock_flags = 0;
2145
2146 spin_unlock_irqrestore(&memcg->move_lock, flags);
2147 }
2148
2149 rcu_read_unlock();
2150 }
2151
2152
2153
2154
2155
2156 void unlock_page_memcg(struct page *page)
2157 {
2158 __unlock_page_memcg(page->mem_cgroup);
2159 }
2160 EXPORT_SYMBOL(unlock_page_memcg);
2161
2162 struct memcg_stock_pcp {
2163 struct mem_cgroup *cached;
2164 unsigned int nr_pages;
2165 struct work_struct work;
2166 unsigned long flags;
2167 #define FLUSHING_CACHED_CHARGE 0
2168 };
2169 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2170 static DEFINE_MUTEX(percpu_charge_mutex);
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
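/*
 * consume_stock - try to satisfy a charge of @nr_pages from the per-cpu
 * charge cache; returns true on success.  Requests larger than
 * MEMCG_CHARGE_BATCH are never served from the stock.
 */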
2183 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2184 {
2185 struct memcg_stock_pcp *stock;
2186 unsigned long flags;
2187 bool ret = false;
2188
2189 if (nr_pages > MEMCG_CHARGE_BATCH)
2190 return ret;
2191
2192 local_irq_save(flags);
2193
2194 stock = this_cpu_ptr(&memcg_stock);
2195 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2196 stock->nr_pages -= nr_pages;
2197 ret = true;
2198 }
2199
2200 local_irq_restore(flags);
2201
2202 return ret;
2203 }
2204
2205
2206
2207
2208 static void drain_stock(struct memcg_stock_pcp *stock)
2209 {
2210 struct mem_cgroup *old = stock->cached;
2211
2212 if (stock->nr_pages) {
2213 page_counter_uncharge(&old->memory, stock->nr_pages);
2214 if (do_memsw_account())
2215 page_counter_uncharge(&old->memsw, stock->nr_pages);
2216 css_put_many(&old->css, stock->nr_pages);
2217 stock->nr_pages = 0;
2218 }
2219 stock->cached = NULL;
2220 }
2221
2222 static void drain_local_stock(struct work_struct *dummy)
2223 {
2224 struct memcg_stock_pcp *stock;
2225 unsigned long flags;
2226
2227
2228
2229
2230
2231 local_irq_save(flags);
2232
2233 stock = this_cpu_ptr(&memcg_stock);
2234 drain_stock(stock);
2235 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2236
2237 local_irq_restore(flags);
2238 }
2239
2240
2241
2242
2243
2244 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2245 {
2246 struct memcg_stock_pcp *stock;
2247 unsigned long flags;
2248
2249 local_irq_save(flags);
2250
2251 stock = this_cpu_ptr(&memcg_stock);
2252 if (stock->cached != memcg) {
2253 drain_stock(stock);
2254 stock->cached = memcg;
2255 }
2256 stock->nr_pages += nr_pages;
2257
2258 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2259 drain_stock(stock);
2260
2261 local_irq_restore(flags);
2262 }
2263
2264
2265
2266
2267
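/*
 * Drain the per-cpu charge cache on every CPU that currently caches a
 * memcg from @root_memcg's hierarchy; remote CPUs are drained through
 * schedule_work_on().
 */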
2268 static void drain_all_stock(struct mem_cgroup *root_memcg)
2269 {
2270 int cpu, curcpu;
2271
2272
2273 if (!mutex_trylock(&percpu_charge_mutex))
2274 return;
2275
2276
2277
2278
2279
2280
2281 curcpu = get_cpu();
2282 for_each_online_cpu(cpu) {
2283 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2284 struct mem_cgroup *memcg;
2285 bool flush = false;
2286
2287 rcu_read_lock();
2288 memcg = stock->cached;
2289 if (memcg && stock->nr_pages &&
2290 mem_cgroup_is_descendant(memcg, root_memcg))
2291 flush = true;
2292 rcu_read_unlock();
2293
2294 if (flush &&
2295 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2296 if (cpu == curcpu)
2297 drain_local_stock(&stock->work);
2298 else
2299 schedule_work_on(cpu, &stock->work);
2300 }
2301 }
2302 put_cpu();
2303 mutex_unlock(&percpu_charge_mutex);
2304 }
2305
2306 static int memcg_hotplug_cpu_dead(unsigned int cpu)
2307 {
2308 struct memcg_stock_pcp *stock;
2309 struct mem_cgroup *memcg, *mi;
2310
2311 stock = &per_cpu(memcg_stock, cpu);
2312 drain_stock(stock);
2313
2314 for_each_mem_cgroup(memcg) {
2315 int i;
2316
2317 for (i = 0; i < MEMCG_NR_STAT; i++) {
2318 int nid;
2319 long x;
2320
2321 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2322 if (x)
2323 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2324 atomic_long_add(x, &memcg->vmstats[i]);
2325
2326 if (i >= NR_VM_NODE_STAT_ITEMS)
2327 continue;
2328
2329 for_each_node(nid) {
2330 struct mem_cgroup_per_node *pn;
2331
2332 pn = mem_cgroup_nodeinfo(memcg, nid);
2333 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2334 if (x)
2335 do {
2336 atomic_long_add(x, &pn->lruvec_stat[i]);
2337 } while ((pn = parent_nodeinfo(pn, nid)));
2338 }
2339 }
2340
2341 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2342 long x;
2343
2344 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2345 if (x)
2346 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2347 atomic_long_add(x, &memcg->vmevents[i]);
2348 }
2349 }
2350
2351 return 0;
2352 }
2353
2354 static void reclaim_high(struct mem_cgroup *memcg,
2355 unsigned int nr_pages,
2356 gfp_t gfp_mask)
2357 {
2358 do {
2359 if (page_counter_read(&memcg->memory) <= memcg->high)
2360 continue;
2361 memcg_memory_event(memcg, MEMCG_HIGH);
2362 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2363 } while ((memcg = parent_mem_cgroup(memcg)));
2364 }
2365
2366 static void high_work_func(struct work_struct *work)
2367 {
2368 struct mem_cgroup *memcg;
2369
2370 memcg = container_of(work, struct mem_cgroup, high_work);
2371 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2372 }
2373
2374
2375
2376
2377
2378
2379 #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424 #define MEMCG_DELAY_PRECISION_SHIFT 20
2425 #define MEMCG_DELAY_SCALING_SHIFT 14
2426
2427
2428
2429
2430
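/*
 * Compute how long an allocating task should be throttled, based on the
 * largest proportional overage of memory.high along the hierarchy.  The
 * penalty grows quadratically with the overage, is scaled by the number
 * of pages charged and is clamped to MEMCG_MAX_HIGH_DELAY_JIFFIES.
 */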
2431 static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2432 unsigned int nr_pages)
2433 {
2434 unsigned long penalty_jiffies;
2435 u64 max_overage = 0;
2436
2437 do {
2438 unsigned long usage, high;
2439 u64 overage;
2440
2441 usage = page_counter_read(&memcg->memory);
2442 high = READ_ONCE(memcg->high);
2443
2444 if (usage <= high)
2445 continue;
2446
2447
2448
2449
2450
2451 high = max(high, 1UL);
2452
2453 overage = usage - high;
2454 overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2455 overage = div64_u64(overage, high);
2456
2457 if (overage > max_overage)
2458 max_overage = overage;
2459 } while ((memcg = parent_mem_cgroup(memcg)) &&
2460 !mem_cgroup_is_root(memcg));
2461
2462 if (!max_overage)
2463 return 0;
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473 penalty_jiffies = max_overage * max_overage * HZ;
2474 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2475 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485 penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2486
2487
2488
2489
2490
2491
2492 return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2493 }
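
For reference, the fixed-point arithmetic above can be checked in isolation. The sketch below re-implements the same shifts; HZ and the usage/high/batch values are example numbers, not taken from a real system:

#include <stdio.h>

#define HZ				250ULL	/* example tick rate */
#define MEMCG_CHARGE_BATCH		32ULL
#define MEMCG_MAX_HIGH_DELAY_JIFFIES	(2ULL * HZ)
#define MEMCG_DELAY_PRECISION_SHIFT	20
#define MEMCG_DELAY_SCALING_SHIFT	14

static unsigned long long high_delay(unsigned long long usage,
				     unsigned long long high,
				     unsigned long long nr_pages)
{
	unsigned long long overage, penalty;

	if (usage <= high)
		return 0;
	if (high < 1)
		high = 1;		/* mirrors high = max(high, 1UL) above */

	overage = ((usage - high) << MEMCG_DELAY_PRECISION_SHIFT) / high;

	penalty = overage * overage * HZ;
	penalty >>= MEMCG_DELAY_PRECISION_SHIFT;
	penalty >>= MEMCG_DELAY_SCALING_SHIFT;
	penalty = penalty * nr_pages / MEMCG_CHARGE_BATCH;

	return penalty < MEMCG_MAX_HIGH_DELAY_JIFFIES ?
		penalty : MEMCG_MAX_HIGH_DELAY_JIFFIES;
}

int main(void)
{
	/* 25% over a 100000-page high limit, charging one full batch */
	printf("penalty = %llu jiffies (HZ=%llu)\n",
	       high_delay(125000, 100000, 32), HZ);
	return 0;
}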
2494
2495 /*
2496 * Scheduled by try_charge() when the high limit is breached; reclaims the
2497 * excess and throttles the task on its way back to userland.
2498 */
2499 void mem_cgroup_handle_over_high(void)
2500 {
2501 unsigned long penalty_jiffies;
2502 unsigned long pflags;
2503 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2504 struct mem_cgroup *memcg;
2505
2506 if (likely(!nr_pages))
2507 return;
2508
2509 memcg = get_mem_cgroup_from_mm(current->mm);
2510 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2511 current->memcg_nr_pages_over_high = 0;
2512
2513
2514
2515
2516
2517 penalty_jiffies = calculate_high_delay(memcg, nr_pages);
2518
2519
2520
2521
2522
2523
2524
2525 if (penalty_jiffies <= HZ / 100)
2526 goto out;
2527
2528
2529
2530
2531
2532
2533 psi_memstall_enter(&pflags);
2534 schedule_timeout_killable(penalty_jiffies);
2535 psi_memstall_leave(&pflags);
2536
2537 out:
2538 css_put(&memcg->css);
2539 }
2540
2541 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2542 unsigned int nr_pages)
2543 {
2544 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2545 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2546 struct mem_cgroup *mem_over_limit;
2547 struct page_counter *counter;
2548 unsigned long nr_reclaimed;
2549 bool may_swap = true;
2550 bool drained = false;
2551 enum oom_status oom_status;
2552
2553 if (mem_cgroup_is_root(memcg))
2554 return 0;
2555 retry:
2556 if (consume_stock(memcg, nr_pages))
2557 return 0;
2558
2559 if (!do_memsw_account() ||
2560 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2561 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2562 goto done_restock;
2563 if (do_memsw_account())
2564 page_counter_uncharge(&memcg->memsw, batch);
2565 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2566 } else {
2567 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2568 may_swap = false;
2569 }
2570
2571 if (batch > nr_pages) {
2572 batch = nr_pages;
2573 goto retry;
2574 }
2575
2576
2577
2578
2579
2580
2581
2582 if (gfp_mask & __GFP_ATOMIC)
2583 goto force;
2584
2585
2586
2587
2588
2589
2590
2591 if (unlikely(should_force_charge()))
2592 goto force;
2593
2594
2595
2596
2597
2598
2599
2600 if (unlikely(current->flags & PF_MEMALLOC))
2601 goto force;
2602
2603 if (unlikely(task_in_memcg_oom(current)))
2604 goto nomem;
2605
2606 if (!gfpflags_allow_blocking(gfp_mask))
2607 goto nomem;
2608
2609 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2610
2611 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2612 gfp_mask, may_swap);
2613
2614 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2615 goto retry;
2616
2617 if (!drained) {
2618 drain_all_stock(mem_over_limit);
2619 drained = true;
2620 goto retry;
2621 }
2622
2623 if (gfp_mask & __GFP_NORETRY)
2624 goto nomem;
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2635 goto retry;
2636
2637
2638
2639
2640 if (mem_cgroup_wait_acct_move(mem_over_limit))
2641 goto retry;
2642
2643 if (nr_retries--)
2644 goto retry;
2645
2646 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2647 goto nomem;
2648
2649 if (gfp_mask & __GFP_NOFAIL)
2650 goto force;
2651
2652 if (fatal_signal_pending(current))
2653 goto force;
2654
2655
2656
2657
2658
2659
2660 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2661 get_order(nr_pages * PAGE_SIZE));
2662 switch (oom_status) {
2663 case OOM_SUCCESS:
2664 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2665 goto retry;
2666 case OOM_FAILED:
2667 goto force;
2668 default:
2669 goto nomem;
2670 }
2671 nomem:
2672 if (!(gfp_mask & __GFP_NOFAIL))
2673 return -ENOMEM;
2674 force:
2675
2676
2677
2678
2679
2680 page_counter_charge(&memcg->memory, nr_pages);
2681 if (do_memsw_account())
2682 page_counter_charge(&memcg->memsw, nr_pages);
2683 css_get_many(&memcg->css, nr_pages);
2684
2685 return 0;
2686
2687 done_restock:
2688 css_get_many(&memcg->css, batch);
2689 if (batch > nr_pages)
2690 refill_stock(memcg, batch - nr_pages);
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701 do {
2702 if (page_counter_read(&memcg->memory) > memcg->high) {
2703
2704 if (in_interrupt()) {
2705 schedule_work(&memcg->high_work);
2706 break;
2707 }
2708 current->memcg_nr_pages_over_high += batch;
2709 set_notify_resume(current);
2710 break;
2711 }
2712 } while ((memcg = parent_mem_cgroup(memcg)));
2713
2714 return 0;
2715 }
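
The first step of try_charge() is a charge-with-rollback against two page counters: mem+swap is charged first, and if the plain memory counter then refuses, the mem+swap charge is undone so the pair never drifts apart. A simplified, single-level user-space sketch of that step; struct counter and the limits are invented, and the real page counters are hierarchical and atomic:

#include <stdbool.h>
#include <stdio.h>

struct counter {
	long usage;
	long max;
};

static bool counter_try_charge(struct counter *c, long pages)
{
	if (c->usage + pages > c->max)
		return false;
	c->usage += pages;
	return true;
}

static void counter_uncharge(struct counter *c, long pages)
{
	c->usage -= pages;
}

static bool charge(struct counter *memsw, struct counter *memory, long pages)
{
	if (!counter_try_charge(memsw, pages))
		return false;			/* hit the mem+swap limit */
	if (!counter_try_charge(memory, pages)) {
		counter_uncharge(memsw, pages);	/* roll back the first charge */
		return false;			/* hit the memory limit */
	}
	return true;
}

int main(void)
{
	struct counter memsw = { .usage = 90, .max = 100 };
	struct counter memory = { .usage = 95, .max = 100 };

	printf("charge 8 pages: %s\n",
	       charge(&memsw, &memory, 8) ? "ok" : "failed");
	printf("memsw usage after rollback: %ld\n", memsw.usage);
	return 0;
}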
2716
2717 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2718 {
2719 if (mem_cgroup_is_root(memcg))
2720 return;
2721
2722 page_counter_uncharge(&memcg->memory, nr_pages);
2723 if (do_memsw_account())
2724 page_counter_uncharge(&memcg->memsw, nr_pages);
2725
2726 css_put_many(&memcg->css, nr_pages);
2727 }
2728
2729 static void lock_page_lru(struct page *page, int *isolated)
2730 {
2731 pg_data_t *pgdat = page_pgdat(page);
2732
2733 spin_lock_irq(&pgdat->lru_lock);
2734 if (PageLRU(page)) {
2735 struct lruvec *lruvec;
2736
2737 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2738 ClearPageLRU(page);
2739 del_page_from_lru_list(page, lruvec, page_lru(page));
2740 *isolated = 1;
2741 } else
2742 *isolated = 0;
2743 }
2744
2745 static void unlock_page_lru(struct page *page, int isolated)
2746 {
2747 pg_data_t *pgdat = page_pgdat(page);
2748
2749 if (isolated) {
2750 struct lruvec *lruvec;
2751
2752 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2753 VM_BUG_ON_PAGE(PageLRU(page), page);
2754 SetPageLRU(page);
2755 add_page_to_lru_list(page, lruvec, page_lru(page));
2756 }
2757 spin_unlock_irq(&pgdat->lru_lock);
2758 }
2759
2760 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2761 bool lrucare)
2762 {
2763 int isolated;
2764
2765 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2766
2767
2768
2769
2770
2771 if (lrucare)
2772 lock_page_lru(page, &isolated);
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788 page->mem_cgroup = memcg;
2789
2790 if (lrucare)
2791 unlock_page_lru(page, isolated);
2792 }
2793
2794 #ifdef CONFIG_MEMCG_KMEM
2795
2796
2797
2798
2799
2800
2801 struct mem_cgroup *mem_cgroup_from_obj(void *p)
2802 {
2803 struct page *page;
2804
2805 if (mem_cgroup_disabled())
2806 return NULL;
2807
2808 page = virt_to_head_page(p);
2809
2810
2811
2812
2813
2814
2815 if (PageSlab(page))
2816 return memcg_from_slab_page(page);
2817
2818
2819 return page->mem_cgroup;
2820 }
2821
2822 static int memcg_alloc_cache_id(void)
2823 {
2824 int id, size;
2825 int err;
2826
2827 id = ida_simple_get(&memcg_cache_ida,
2828 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2829 if (id < 0)
2830 return id;
2831
2832 if (id < memcg_nr_cache_ids)
2833 return id;
2834
2835
2836
2837
2838
2839 down_write(&memcg_cache_ids_sem);
2840
2841 size = 2 * (id + 1);
2842 if (size < MEMCG_CACHES_MIN_SIZE)
2843 size = MEMCG_CACHES_MIN_SIZE;
2844 else if (size > MEMCG_CACHES_MAX_SIZE)
2845 size = MEMCG_CACHES_MAX_SIZE;
2846
2847 err = memcg_update_all_caches(size);
2848 if (!err)
2849 err = memcg_update_all_list_lrus(size);
2850 if (!err)
2851 memcg_nr_cache_ids = size;
2852
2853 up_write(&memcg_cache_ids_sem);
2854
2855 if (err) {
2856 ida_simple_remove(&memcg_cache_ida, id);
2857 return err;
2858 }
2859 return id;
2860 }
2861
2862 static void memcg_free_cache_id(int id)
2863 {
2864 ida_simple_remove(&memcg_cache_ida, id);
2865 }
2866
2867 struct memcg_kmem_cache_create_work {
2868 struct mem_cgroup *memcg;
2869 struct kmem_cache *cachep;
2870 struct work_struct work;
2871 };
2872
2873 static void memcg_kmem_cache_create_func(struct work_struct *w)
2874 {
2875 struct memcg_kmem_cache_create_work *cw =
2876 container_of(w, struct memcg_kmem_cache_create_work, work);
2877 struct mem_cgroup *memcg = cw->memcg;
2878 struct kmem_cache *cachep = cw->cachep;
2879
2880 memcg_create_kmem_cache(memcg, cachep);
2881
2882 css_put(&memcg->css);
2883 kfree(cw);
2884 }
2885
2886
2887
2888
2889 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2890 struct kmem_cache *cachep)
2891 {
2892 struct memcg_kmem_cache_create_work *cw;
2893
2894 if (!css_tryget_online(&memcg->css))
2895 return;
2896
2897 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2898 if (!cw)
2899 return;
2900
2901 cw->memcg = memcg;
2902 cw->cachep = cachep;
2903 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2904
2905 queue_work(memcg_kmem_cache_wq, &cw->work);
2906 }
2907
2908 static inline bool memcg_kmem_bypass(void)
2909 {
2910 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2911 return true;
2912 return false;
2913 }
2914
2915 /**
2916 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2917 * @cachep: the original global kmem cache
2918 *
2919 * Return the kmem_cache to charge an allocation to: the cache belonging
2920 * to the current task's (or active_memcg's) memory cgroup if one exists,
2921 * otherwise @cachep itself.  If the per-memcg cache does not exist yet,
2922 * its creation is scheduled on a workqueue and the root cache is used
2923 * for this particular allocation.
2924 *
2925 * Contexts that bypass kmem accounting (interrupts, kthreads, tasks
2926 * without an mm) always get the root cache.
2927 *
2928 * A reference is taken on a per-memcg cache before it is returned;
2929 * callers must pair this with memcg_kmem_put_cache().
2930 */
2931 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2932 {
2933 struct mem_cgroup *memcg;
2934 struct kmem_cache *memcg_cachep;
2935 struct memcg_cache_array *arr;
2936 int kmemcg_id;
2937
2938 VM_BUG_ON(!is_root_cache(cachep));
2939
2940 if (memcg_kmem_bypass())
2941 return cachep;
2942
2943 rcu_read_lock();
2944
2945 if (unlikely(current->active_memcg))
2946 memcg = current->active_memcg;
2947 else
2948 memcg = mem_cgroup_from_task(current);
2949
2950 if (!memcg || memcg == root_mem_cgroup)
2951 goto out_unlock;
2952
2953 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2954 if (kmemcg_id < 0)
2955 goto out_unlock;
2956
2957 arr = rcu_dereference(cachep->memcg_params.memcg_caches);
2958
2959
2960
2961
2962
2963
2964 memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985 if (unlikely(!memcg_cachep))
2986 memcg_schedule_kmem_cache_create(memcg, cachep);
2987 else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
2988 cachep = memcg_cachep;
2989 out_unlock:
2990 rcu_read_unlock();
2991 return cachep;
2992 }
2993
2994
2995
2996
2997
2998 void memcg_kmem_put_cache(struct kmem_cache *cachep)
2999 {
3000 if (!is_root_cache(cachep))
3001 percpu_ref_put(&cachep->memcg_params.refcnt);
3002 }
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013 int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
3014 struct mem_cgroup *memcg)
3015 {
3016 unsigned int nr_pages = 1 << order;
3017 struct page_counter *counter;
3018 int ret;
3019
3020 ret = try_charge(memcg, gfp, nr_pages);
3021 if (ret)
3022 return ret;
3023
3024 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
3025 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
3026
3027
3028
3029
3030
3031
3032 if (gfp & __GFP_NOFAIL) {
3033 page_counter_charge(&memcg->kmem, nr_pages);
3034 return 0;
3035 }
3036 cancel_charge(memcg, nr_pages);
3037 return -ENOMEM;
3038 }
3039 return 0;
3040 }
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050 int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
3051 {
3052 struct mem_cgroup *memcg;
3053 int ret = 0;
3054
3055 if (memcg_kmem_bypass())
3056 return 0;
3057
3058 memcg = get_mem_cgroup_from_current();
3059 if (!mem_cgroup_is_root(memcg)) {
3060 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
3061 if (!ret) {
3062 page->mem_cgroup = memcg;
3063 __SetPageKmemcg(page);
3064 }
3065 }
3066 css_put(&memcg->css);
3067 return ret;
3068 }
3069
3070
3071
3072
3073
3074
3075 void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
3076 unsigned int nr_pages)
3077 {
3078 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3079 page_counter_uncharge(&memcg->kmem, nr_pages);
3080
3081 page_counter_uncharge(&memcg->memory, nr_pages);
3082 if (do_memsw_account())
3083 page_counter_uncharge(&memcg->memsw, nr_pages);
3084 }
3085
3086
3087
3088
3089
3090 void __memcg_kmem_uncharge(struct page *page, int order)
3091 {
3092 struct mem_cgroup *memcg = page->mem_cgroup;
3093 unsigned int nr_pages = 1 << order;
3094
3095 if (!memcg)
3096 return;
3097
3098 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3099 __memcg_kmem_uncharge_memcg(memcg, nr_pages);
3100 page->mem_cgroup = NULL;
3101
3102
3103 if (PageKmemcg(page))
3104 __ClearPageKmemcg(page);
3105
3106 css_put_many(&memcg->css, nr_pages);
3107 }
3108 #endif
3109
3110 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3111
3112
3113
3114
3115
3116 void mem_cgroup_split_huge_fixup(struct page *head)
3117 {
3118 int i;
3119
3120 if (mem_cgroup_disabled())
3121 return;
3122
3123 for (i = 1; i < HPAGE_PMD_NR; i++)
3124 head[i].mem_cgroup = head->mem_cgroup;
3125
3126 __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3127 }
3128 #endif
3129
3130 #ifdef CONFIG_MEMCG_SWAP
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3146 struct mem_cgroup *from, struct mem_cgroup *to)
3147 {
3148 unsigned short old_id, new_id;
3149
3150 old_id = mem_cgroup_id(from);
3151 new_id = mem_cgroup_id(to);
3152
3153 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3154 mod_memcg_state(from, MEMCG_SWAP, -1);
3155 mod_memcg_state(to, MEMCG_SWAP, 1);
3156 return 0;
3157 }
3158 return -EINVAL;
3159 }
3160 #else
3161 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3162 struct mem_cgroup *from, struct mem_cgroup *to)
3163 {
3164 return -EINVAL;
3165 }
3166 #endif
3167
3168 static DEFINE_MUTEX(memcg_max_mutex);
3169
3170 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3171 unsigned long max, bool memsw)
3172 {
3173 bool enlarge = false;
3174 bool drained = false;
3175 int ret;
3176 bool limits_invariant;
3177 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3178
3179 do {
3180 if (signal_pending(current)) {
3181 ret = -EINTR;
3182 break;
3183 }
3184
3185 mutex_lock(&memcg_max_mutex);
3186
3187
3188
3189
3190 limits_invariant = memsw ? max >= memcg->memory.max :
3191 max <= memcg->memsw.max;
3192 if (!limits_invariant) {
3193 mutex_unlock(&memcg_max_mutex);
3194 ret = -EINVAL;
3195 break;
3196 }
3197 if (max > counter->max)
3198 enlarge = true;
3199 ret = page_counter_set_max(counter, max);
3200 mutex_unlock(&memcg_max_mutex);
3201
3202 if (!ret)
3203 break;
3204
3205 if (!drained) {
3206 drain_all_stock(memcg);
3207 drained = true;
3208 continue;
3209 }
3210
3211 if (!try_to_free_mem_cgroup_pages(memcg, 1,
3212 GFP_KERNEL, !memsw)) {
3213 ret = -EBUSY;
3214 break;
3215 }
3216 } while (true);
3217
3218 if (!ret && enlarge)
3219 memcg_oom_recover(memcg);
3220
3221 return ret;
3222 }
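
mem_cgroup_resize_max() only accepts a new limit that keeps memory.max at or below memsw.max, whichever of the two is being written. A quick stand-alone check of that invariant with made-up page counts:

#include <stdbool.h>
#include <stdio.h>

/* mirrors the limits_invariant test in mem_cgroup_resize_max() */
static bool limits_invariant(bool memsw, unsigned long new_max,
			     unsigned long memory_max, unsigned long memsw_max)
{
	return memsw ? new_max >= memory_max : new_max <= memsw_max;
}

int main(void)
{
	unsigned long memory_max = 1000, memsw_max = 1500;

	/* raising memory above memsw is rejected */
	printf("memory := 2000 -> %s\n",
	       limits_invariant(false, 2000, memory_max, memsw_max) ? "ok" : "-EINVAL");
	/* shrinking memsw below memory is rejected */
	printf("memsw  := 800  -> %s\n",
	       limits_invariant(true, 800, memory_max, memsw_max) ? "ok" : "-EINVAL");
	/* a consistent update is accepted */
	printf("memory := 1200 -> %s\n",
	       limits_invariant(false, 1200, memory_max, memsw_max) ? "ok" : "-EINVAL");
	return 0;
}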
3223
3224 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3225 gfp_t gfp_mask,
3226 unsigned long *total_scanned)
3227 {
3228 unsigned long nr_reclaimed = 0;
3229 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3230 unsigned long reclaimed;
3231 int loop = 0;
3232 struct mem_cgroup_tree_per_node *mctz;
3233 unsigned long excess;
3234 unsigned long nr_scanned;
3235
3236 if (order > 0)
3237 return 0;
3238
3239 mctz = soft_limit_tree_node(pgdat->node_id);
3240
3241
3242
3243
3244
3245
3246 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3247 return 0;
3248
3249
3250
3251
3252
3253
3254 do {
3255 if (next_mz)
3256 mz = next_mz;
3257 else
3258 mz = mem_cgroup_largest_soft_limit_node(mctz);
3259 if (!mz)
3260 break;
3261
3262 nr_scanned = 0;
3263 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3264 gfp_mask, &nr_scanned);
3265 nr_reclaimed += reclaimed;
3266 *total_scanned += nr_scanned;
3267 spin_lock_irq(&mctz->lock);
3268 __mem_cgroup_remove_exceeded(mz, mctz);
3269
3270
3271
3272
3273
3274 next_mz = NULL;
3275 if (!reclaimed)
3276 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3277
3278 excess = soft_limit_excess(mz->memcg);
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3289 spin_unlock_irq(&mctz->lock);
3290 css_put(&mz->memcg->css);
3291 loop++;
3292
3293
3294
3295
3296
3297 if (!nr_reclaimed &&
3298 (next_mz == NULL ||
3299 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3300 break;
3301 } while (!nr_reclaimed);
3302 if (next_mz)
3303 css_put(&next_mz->memcg->css);
3304 return nr_reclaimed;
3305 }
3306
3307
3308
3309
3310
3311
3312
3313 static inline bool memcg_has_children(struct mem_cgroup *memcg)
3314 {
3315 bool ret;
3316
3317 rcu_read_lock();
3318 ret = css_next_child(NULL, &memcg->css);
3319 rcu_read_unlock();
3320 return ret;
3321 }
3322
3323
3324
3325
3326
3327
3328 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3329 {
3330 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3331
3332
3333 lru_add_drain_all();
3334
3335 drain_all_stock(memcg);
3336
3337
3338 while (nr_retries && page_counter_read(&memcg->memory)) {
3339 int progress;
3340
3341 if (signal_pending(current))
3342 return -EINTR;
3343
3344 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3345 GFP_KERNEL, true);
3346 if (!progress) {
3347 nr_retries--;
3348
3349 congestion_wait(BLK_RW_ASYNC, HZ/10);
3350 }
3351
3352 }
3353
3354 return 0;
3355 }
3356
3357 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3358 char *buf, size_t nbytes,
3359 loff_t off)
3360 {
3361 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3362
3363 if (mem_cgroup_is_root(memcg))
3364 return -EINVAL;
3365 return mem_cgroup_force_empty(memcg) ?: nbytes;
3366 }
3367
3368 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3369 struct cftype *cft)
3370 {
3371 return mem_cgroup_from_css(css)->use_hierarchy;
3372 }
3373
3374 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3375 struct cftype *cft, u64 val)
3376 {
3377 int retval = 0;
3378 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3379 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3380
3381 if (memcg->use_hierarchy == val)
3382 return 0;
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3393 (val == 1 || val == 0)) {
3394 if (!memcg_has_children(memcg))
3395 memcg->use_hierarchy = val;
3396 else
3397 retval = -EBUSY;
3398 } else
3399 retval = -EINVAL;
3400
3401 return retval;
3402 }
3403
3404 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3405 {
3406 unsigned long val;
3407
3408 if (mem_cgroup_is_root(memcg)) {
3409 val = memcg_page_state(memcg, MEMCG_CACHE) +
3410 memcg_page_state(memcg, MEMCG_RSS);
3411 if (swap)
3412 val += memcg_page_state(memcg, MEMCG_SWAP);
3413 } else {
3414 if (!swap)
3415 val = page_counter_read(&memcg->memory);
3416 else
3417 val = page_counter_read(&memcg->memsw);
3418 }
3419 return val;
3420 }
3421
3422 enum {
3423 RES_USAGE,
3424 RES_LIMIT,
3425 RES_MAX_USAGE,
3426 RES_FAILCNT,
3427 RES_SOFT_LIMIT,
3428 };
3429
3430 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3431 struct cftype *cft)
3432 {
3433 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3434 struct page_counter *counter;
3435
3436 switch (MEMFILE_TYPE(cft->private)) {
3437 case _MEM:
3438 counter = &memcg->memory;
3439 break;
3440 case _MEMSWAP:
3441 counter = &memcg->memsw;
3442 break;
3443 case _KMEM:
3444 counter = &memcg->kmem;
3445 break;
3446 case _TCP:
3447 counter = &memcg->tcpmem;
3448 break;
3449 default:
3450 BUG();
3451 }
3452
3453 switch (MEMFILE_ATTR(cft->private)) {
3454 case RES_USAGE:
3455 if (counter == &memcg->memory)
3456 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3457 if (counter == &memcg->memsw)
3458 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3459 return (u64)page_counter_read(counter) * PAGE_SIZE;
3460 case RES_LIMIT:
3461 return (u64)counter->max * PAGE_SIZE;
3462 case RES_MAX_USAGE:
3463 return (u64)counter->watermark * PAGE_SIZE;
3464 case RES_FAILCNT:
3465 return counter->failcnt;
3466 case RES_SOFT_LIMIT:
3467 return (u64)memcg->soft_limit * PAGE_SIZE;
3468 default:
3469 BUG();
3470 }
3471 }
3472
3473 static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3474 {
3475 unsigned long stat[MEMCG_NR_STAT] = {0};
3476 struct mem_cgroup *mi;
3477 int node, cpu, i;
3478
3479 for_each_online_cpu(cpu)
3480 for (i = 0; i < MEMCG_NR_STAT; i++)
3481 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3482
3483 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3484 for (i = 0; i < MEMCG_NR_STAT; i++)
3485 atomic_long_add(stat[i], &mi->vmstats[i]);
3486
3487 for_each_node(node) {
3488 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3489 struct mem_cgroup_per_node *pi;
3490
3491 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3492 stat[i] = 0;
3493
3494 for_each_online_cpu(cpu)
3495 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3496 stat[i] += per_cpu(
3497 pn->lruvec_stat_cpu->count[i], cpu);
3498
3499 for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3500 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3501 atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3502 }
3503 }
3504
3505 static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3506 {
3507 unsigned long events[NR_VM_EVENT_ITEMS];
3508 struct mem_cgroup *mi;
3509 int cpu, i;
3510
3511 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3512 events[i] = 0;
3513
3514 for_each_online_cpu(cpu)
3515 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3516 events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3517 cpu);
3518
3519 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3520 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3521 atomic_long_add(events[i], &mi->vmevents[i]);
3522 }
3523
3524 #ifdef CONFIG_MEMCG_KMEM
3525 static int memcg_online_kmem(struct mem_cgroup *memcg)
3526 {
3527 int memcg_id;
3528
3529 if (cgroup_memory_nokmem)
3530 return 0;
3531
3532 BUG_ON(memcg->kmemcg_id >= 0);
3533 BUG_ON(memcg->kmem_state);
3534
3535 memcg_id = memcg_alloc_cache_id();
3536 if (memcg_id < 0)
3537 return memcg_id;
3538
3539 static_branch_inc(&memcg_kmem_enabled_key);
3540
3541
3542
3543
3544
3545
3546 memcg->kmemcg_id = memcg_id;
3547 memcg->kmem_state = KMEM_ONLINE;
3548 INIT_LIST_HEAD(&memcg->kmem_caches);
3549
3550 return 0;
3551 }
3552
3553 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3554 {
3555 struct cgroup_subsys_state *css;
3556 struct mem_cgroup *parent, *child;
3557 int kmemcg_id;
3558
3559 if (memcg->kmem_state != KMEM_ONLINE)
3560 return;
3561
3562
3563
3564
3565
3566
3567 memcg->kmem_state = KMEM_ALLOCATED;
3568
3569 parent = parent_mem_cgroup(memcg);
3570 if (!parent)
3571 parent = root_mem_cgroup;
3572
3573
3574
3575
3576 memcg_deactivate_kmem_caches(memcg, parent);
3577
3578 kmemcg_id = memcg->kmemcg_id;
3579 BUG_ON(kmemcg_id < 0);
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589 rcu_read_lock();
3590 css_for_each_descendant_pre(css, &memcg->css) {
3591 child = mem_cgroup_from_css(css);
3592 BUG_ON(child->kmemcg_id != kmemcg_id);
3593 child->kmemcg_id = parent->kmemcg_id;
3594 if (!memcg->use_hierarchy)
3595 break;
3596 }
3597 rcu_read_unlock();
3598
3599 memcg_drain_all_list_lrus(kmemcg_id, parent);
3600
3601 memcg_free_cache_id(kmemcg_id);
3602 }
3603
3604 static void memcg_free_kmem(struct mem_cgroup *memcg)
3605 {
3606
3607 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3608 memcg_offline_kmem(memcg);
3609
3610 if (memcg->kmem_state == KMEM_ALLOCATED) {
3611 WARN_ON(!list_empty(&memcg->kmem_caches));
3612 static_branch_dec(&memcg_kmem_enabled_key);
3613 }
3614 }
3615 #else
3616 static int memcg_online_kmem(struct mem_cgroup *memcg)
3617 {
3618 return 0;
3619 }
3620 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3621 {
3622 }
3623 static void memcg_free_kmem(struct mem_cgroup *memcg)
3624 {
3625 }
3626 #endif
3627
3628 static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3629 unsigned long max)
3630 {
3631 int ret;
3632
3633 mutex_lock(&memcg_max_mutex);
3634 ret = page_counter_set_max(&memcg->kmem, max);
3635 mutex_unlock(&memcg_max_mutex);
3636 return ret;
3637 }
3638
3639 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3640 {
3641 int ret;
3642
3643 mutex_lock(&memcg_max_mutex);
3644
3645 ret = page_counter_set_max(&memcg->tcpmem, max);
3646 if (ret)
3647 goto out;
3648
3649 if (!memcg->tcpmem_active) {
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666 static_branch_inc(&memcg_sockets_enabled_key);
3667 memcg->tcpmem_active = true;
3668 }
3669 out:
3670 mutex_unlock(&memcg_max_mutex);
3671 return ret;
3672 }
3673
3674
3675
3676
3677
3678 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3679 char *buf, size_t nbytes, loff_t off)
3680 {
3681 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3682 unsigned long nr_pages;
3683 int ret;
3684
3685 buf = strstrip(buf);
3686 ret = page_counter_memparse(buf, "-1", &nr_pages);
3687 if (ret)
3688 return ret;
3689
3690 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3691 case RES_LIMIT:
3692 if (mem_cgroup_is_root(memcg)) {
3693 ret = -EINVAL;
3694 break;
3695 }
3696 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3697 case _MEM:
3698 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3699 break;
3700 case _MEMSWAP:
3701 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3702 break;
3703 case _KMEM:
3704 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3705 "Please report your usecase to linux-mm@kvack.org if you "
3706 "depend on this functionality.\n");
3707 ret = memcg_update_kmem_max(memcg, nr_pages);
3708 break;
3709 case _TCP:
3710 ret = memcg_update_tcp_max(memcg, nr_pages);
3711 break;
3712 }
3713 break;
3714 case RES_SOFT_LIMIT:
3715 memcg->soft_limit = nr_pages;
3716 ret = 0;
3717 break;
3718 }
3719 return ret ?: nbytes;
3720 }
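
From userspace these limits are plain writes to the cgroup-v1 control files; page_counter_memparse() accepts byte counts with K/M/G suffixes as well as "-1" for unlimited. A minimal example, assuming a v1 memory hierarchy mounted at /sys/fs/cgroup/memory and an already-created group called "demo" (both are assumptions for the example):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	/* hard limit: parsed by page_counter_memparse(), so "100M" works */
	write_str("/sys/fs/cgroup/memory/demo/memory.limit_in_bytes", "100M");
	/* soft limit used by mem_cgroup_soft_limit_reclaim() */
	write_str("/sys/fs/cgroup/memory/demo/memory.soft_limit_in_bytes", "64M");
	return 0;
}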
3721
3722 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3723 size_t nbytes, loff_t off)
3724 {
3725 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3726 struct page_counter *counter;
3727
3728 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3729 case _MEM:
3730 counter = &memcg->memory;
3731 break;
3732 case _MEMSWAP:
3733 counter = &memcg->memsw;
3734 break;
3735 case _KMEM:
3736 counter = &memcg->kmem;
3737 break;
3738 case _TCP:
3739 counter = &memcg->tcpmem;
3740 break;
3741 default:
3742 BUG();
3743 }
3744
3745 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3746 case RES_MAX_USAGE:
3747 page_counter_reset_watermark(counter);
3748 break;
3749 case RES_FAILCNT:
3750 counter->failcnt = 0;
3751 break;
3752 default:
3753 BUG();
3754 }
3755
3756 return nbytes;
3757 }
3758
3759 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3760 struct cftype *cft)
3761 {
3762 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3763 }
3764
3765 #ifdef CONFIG_MMU
3766 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3767 struct cftype *cft, u64 val)
3768 {
3769 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3770
3771 if (val & ~MOVE_MASK)
3772 return -EINVAL;
3773
3774
3775
3776
3777
3778
3779
3780 memcg->move_charge_at_immigrate = val;
3781 return 0;
3782 }
3783 #else
3784 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3785 struct cftype *cft, u64 val)
3786 {
3787 return -ENOSYS;
3788 }
3789 #endif
3790
3791 #ifdef CONFIG_NUMA
3792
3793 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3794 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3795 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3796
3797 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3798 int nid, unsigned int lru_mask)
3799 {
3800 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
3801 unsigned long nr = 0;
3802 enum lru_list lru;
3803
3804 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3805
3806 for_each_lru(lru) {
3807 if (!(BIT(lru) & lru_mask))
3808 continue;
3809 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3810 }
3811 return nr;
3812 }
3813
3814 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3815 unsigned int lru_mask)
3816 {
3817 unsigned long nr = 0;
3818 enum lru_list lru;
3819
3820 for_each_lru(lru) {
3821 if (!(BIT(lru) & lru_mask))
3822 continue;
3823 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3824 }
3825 return nr;
3826 }
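
Both helpers above sum only the LRU lists whose bit is set in lru_mask. The sketch below repeats that bit-mask filtering with made-up per-list page counts:

#include <stdio.h>

enum lru_list {
	LRU_INACTIVE_ANON,
	LRU_ACTIVE_ANON,
	LRU_INACTIVE_FILE,
	LRU_ACTIVE_FILE,
	LRU_UNEVICTABLE,
	NR_LRU_LISTS,
};

#define BIT(nr)		(1U << (nr))
#define LRU_ALL_FILE	(BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON	(BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL		((1U << NR_LRU_LISTS) - 1)

static unsigned long nr_lru_pages(const unsigned long counts[NR_LRU_LISTS],
				  unsigned int lru_mask)
{
	unsigned long nr = 0;

	for (int lru = 0; lru < NR_LRU_LISTS; lru++)
		if (BIT(lru) & lru_mask)
			nr += counts[lru];
	return nr;
}

int main(void)
{
	unsigned long counts[NR_LRU_LISTS] = { 10, 20, 30, 40, 5 };

	printf("total=%lu file=%lu anon=%lu\n",
	       nr_lru_pages(counts, LRU_ALL),
	       nr_lru_pages(counts, LRU_ALL_FILE),
	       nr_lru_pages(counts, LRU_ALL_ANON));
	return 0;
}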
3827
3828 static int memcg_numa_stat_show(struct seq_file *m, void *v)
3829 {
3830 struct numa_stat {
3831 const char *name;
3832 unsigned int lru_mask;
3833 };
3834
3835 static const struct numa_stat stats[] = {
3836 { "total", LRU_ALL },
3837 { "file", LRU_ALL_FILE },
3838 { "anon", LRU_ALL_ANON },
3839 { "unevictable", BIT(LRU_UNEVICTABLE) },
3840 };
3841 const struct numa_stat *stat;
3842 int nid;
3843 unsigned long nr;
3844 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3845
3846 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3847 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3848 seq_printf(m, "%s=%lu", stat->name, nr);
3849 for_each_node_state(nid, N_MEMORY) {
3850 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3851 stat->lru_mask);
3852 seq_printf(m, " N%d=%lu", nid, nr);
3853 }
3854 seq_putc(m, '\n');
3855 }
3856
3857 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3858 struct mem_cgroup *iter;
3859
3860 nr = 0;
3861 for_each_mem_cgroup_tree(iter, memcg)
3862 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3863 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3864 for_each_node_state(nid, N_MEMORY) {
3865 nr = 0;
3866 for_each_mem_cgroup_tree(iter, memcg)
3867 nr += mem_cgroup_node_nr_lru_pages(
3868 iter, nid, stat->lru_mask);
3869 seq_printf(m, " N%d=%lu", nid, nr);
3870 }
3871 seq_putc(m, '\n');
3872 }
3873
3874 return 0;
3875 }
3876 #endif
3877
3878 static const unsigned int memcg1_stats[] = {
3879 MEMCG_CACHE,
3880 MEMCG_RSS,
3881 MEMCG_RSS_HUGE,
3882 NR_SHMEM,
3883 NR_FILE_MAPPED,
3884 NR_FILE_DIRTY,
3885 NR_WRITEBACK,
3886 MEMCG_SWAP,
3887 };
3888
3889 static const char *const memcg1_stat_names[] = {
3890 "cache",
3891 "rss",
3892 "rss_huge",
3893 "shmem",
3894 "mapped_file",
3895 "dirty",
3896 "writeback",
3897 "swap",
3898 };
3899
3900
3901 static const unsigned int memcg1_events[] = {
3902 PGPGIN,
3903 PGPGOUT,
3904 PGFAULT,
3905 PGMAJFAULT,
3906 };
3907
3908 static const char *const memcg1_event_names[] = {
3909 "pgpgin",
3910 "pgpgout",
3911 "pgfault",
3912 "pgmajfault",
3913 };
3914
3915 static int memcg_stat_show(struct seq_file *m, void *v)
3916 {
3917 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3918 unsigned long memory, memsw;
3919 struct mem_cgroup *mi;
3920 unsigned int i;
3921
3922 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3923 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3924
3925 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3926 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3927 continue;
3928 seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3929 memcg_page_state_local(memcg, memcg1_stats[i]) *
3930 PAGE_SIZE);
3931 }
3932
3933 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3934 seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3935 memcg_events_local(memcg, memcg1_events[i]));
3936
3937 for (i = 0; i < NR_LRU_LISTS; i++)
3938 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3939 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3940 PAGE_SIZE);
3941
3942
3943 memory = memsw = PAGE_COUNTER_MAX;
3944 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3945 memory = min(memory, mi->memory.max);
3946 memsw = min(memsw, mi->memsw.max);
3947 }
3948 seq_printf(m, "hierarchical_memory_limit %llu\n",
3949 (u64)memory * PAGE_SIZE);
3950 if (do_memsw_account())
3951 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3952 (u64)memsw * PAGE_SIZE);
3953
3954 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3955 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3956 continue;
3957 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3958 (u64)memcg_page_state(memcg, memcg1_stats[i]) *
3959 PAGE_SIZE);
3960 }
3961
3962 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3963 seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3964 (u64)memcg_events(memcg, memcg1_events[i]));
3965
3966 for (i = 0; i < NR_LRU_LISTS; i++)
3967 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3968 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
3969 PAGE_SIZE);
3970
3971 #ifdef CONFIG_DEBUG_VM
3972 {
3973 pg_data_t *pgdat;
3974 struct mem_cgroup_per_node *mz;
3975 struct zone_reclaim_stat *rstat;
3976 unsigned long recent_rotated[2] = {0, 0};
3977 unsigned long recent_scanned[2] = {0, 0};
3978
3979 for_each_online_pgdat(pgdat) {
3980 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3981 rstat = &mz->lruvec.reclaim_stat;
3982
3983 recent_rotated[0] += rstat->recent_rotated[0];
3984 recent_rotated[1] += rstat->recent_rotated[1];
3985 recent_scanned[0] += rstat->recent_scanned[0];
3986 recent_scanned[1] += rstat->recent_scanned[1];
3987 }
3988 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3989 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3990 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3991 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3992 }
3993 #endif
3994
3995 return 0;
3996 }
3997
3998 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3999 struct cftype *cft)
4000 {
4001 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4002
4003 return mem_cgroup_swappiness(memcg);
4004 }
4005
4006 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4007 struct cftype *cft, u64 val)
4008 {
4009 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4010
4011 if (val > 100)
4012 return -EINVAL;
4013
4014 if (css->parent)
4015 memcg->swappiness = val;
4016 else
4017 vm_swappiness = val;
4018
4019 return 0;
4020 }
4021
4022 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4023 {
4024 struct mem_cgroup_threshold_ary *t;
4025 unsigned long usage;
4026 int i;
4027
4028 rcu_read_lock();
4029 if (!swap)
4030 t = rcu_dereference(memcg->thresholds.primary);
4031 else
4032 t = rcu_dereference(memcg->memsw_thresholds.primary);
4033
4034 if (!t)
4035 goto unlock;
4036
4037 usage = mem_cgroup_usage(memcg, swap);
4038
4039
4040
4041
4042
4043
4044 i = t->current_threshold;
4045
4046
4047
4048
4049
4050
4051
4052 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4053 eventfd_signal(t->entries[i].eventfd, 1);
4054
4055
4056 i++;
4057
4058
4059
4060
4061
4062
4063
4064 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4065 eventfd_signal(t->entries[i].eventfd, 1);
4066
4067
4068 t->current_threshold = i - 1;
4069 unlock:
4070 rcu_read_unlock();
4071 }
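
__mem_cgroup_threshold() resumes from the last known position in the sorted threshold array, signals every entry that was crossed in either direction, and stores the new position. The same walk on a plain array; the page counts are invented and printf() stands in for eventfd_signal():

#include <stdio.h>

struct thr_ary {
	int current_threshold;
	int size;
	unsigned long entries[8];	/* sorted ascending */
};

static void check_thresholds(struct thr_ary *t, unsigned long usage)
{
	int i = t->current_threshold;

	/* entries crossed on the way down */
	for (; i >= 0 && t->entries[i] > usage; i--)
		printf("fire: fell below %lu\n", t->entries[i]);
	i++;
	/* entries crossed on the way up */
	for (; i < t->size && t->entries[i] <= usage; i++)
		printf("fire: rose above %lu\n", t->entries[i]);
	t->current_threshold = i - 1;
}

int main(void)
{
	struct thr_ary t = {
		.current_threshold = -1,
		.size = 3,
		.entries = { 100, 200, 300 },
	};

	check_thresholds(&t, 250);	/* fires 100 and 200 */
	check_thresholds(&t, 50);	/* fires 200 and 100 on the way down */
	return 0;
}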
4072
4073 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4074 {
4075 while (memcg) {
4076 __mem_cgroup_threshold(memcg, false);
4077 if (do_memsw_account())
4078 __mem_cgroup_threshold(memcg, true);
4079
4080 memcg = parent_mem_cgroup(memcg);
4081 }
4082 }
4083
4084 static int compare_thresholds(const void *a, const void *b)
4085 {
4086 const struct mem_cgroup_threshold *_a = a;
4087 const struct mem_cgroup_threshold *_b = b;
4088
4089 if (_a->threshold > _b->threshold)
4090 return 1;
4091
4092 if (_a->threshold < _b->threshold)
4093 return -1;
4094
4095 return 0;
4096 }
4097
4098 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4099 {
4100 struct mem_cgroup_eventfd_list *ev;
4101
4102 spin_lock(&memcg_oom_lock);
4103
4104 list_for_each_entry(ev, &memcg->oom_notify, list)
4105 eventfd_signal(ev->eventfd, 1);
4106
4107 spin_unlock(&memcg_oom_lock);
4108 return 0;
4109 }
4110
4111 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4112 {
4113 struct mem_cgroup *iter;
4114
4115 for_each_mem_cgroup_tree(iter, memcg)
4116 mem_cgroup_oom_notify_cb(iter);
4117 }
4118
4119 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4120 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4121 {
4122 struct mem_cgroup_thresholds *thresholds;
4123 struct mem_cgroup_threshold_ary *new;
4124 unsigned long threshold;
4125 unsigned long usage;
4126 int i, size, ret;
4127
4128 ret = page_counter_memparse(args, "-1", &threshold);
4129 if (ret)
4130 return ret;
4131
4132 mutex_lock(&memcg->thresholds_lock);
4133
4134 if (type == _MEM) {
4135 thresholds = &memcg->thresholds;
4136 usage = mem_cgroup_usage(memcg, false);
4137 } else if (type == _MEMSWAP) {
4138 thresholds = &memcg->memsw_thresholds;
4139 usage = mem_cgroup_usage(memcg, true);
4140 } else
4141 BUG();
4142
4143
4144 if (thresholds->primary)
4145 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4146
4147 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4148
4149
4150 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4151 if (!new) {
4152 ret = -ENOMEM;
4153 goto unlock;
4154 }
4155 new->size = size;
4156
4157
4158 if (thresholds->primary) {
4159 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4160 sizeof(struct mem_cgroup_threshold));
4161 }
4162
4163
4164 new->entries[size - 1].eventfd = eventfd;
4165 new->entries[size - 1].threshold = threshold;
4166
4167
4168 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4169 compare_thresholds, NULL);
4170
4171
4172 new->current_threshold = -1;
4173 for (i = 0; i < size; i++) {
4174 if (new->entries[i].threshold <= usage) {
4175
4176
4177
4178
4179
4180 ++new->current_threshold;
4181 } else
4182 break;
4183 }
4184
4185
4186 kfree(thresholds->spare);
4187 thresholds->spare = thresholds->primary;
4188
4189 rcu_assign_pointer(thresholds->primary, new);
4190
4191
4192 synchronize_rcu();
4193
4194 unlock:
4195 mutex_unlock(&memcg->thresholds_lock);
4196
4197 return ret;
4198 }
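
Registering a threshold grows the array by one entry, re-sorts it and recomputes current_threshold so it points at the last entry not above the current usage. The same merge step on bare unsigned longs; the values are invented and the comparator mirrors compare_thresholds():

#include <stdio.h>
#include <stdlib.h>

static int cmp_threshold(const void *a, const void *b)
{
	unsigned long _a = *(const unsigned long *)a;
	unsigned long _b = *(const unsigned long *)b;

	return (_a > _b) - (_a < _b);
}

int main(void)
{
	unsigned long entries[4] = { 100, 200, 300 };	/* existing thresholds */
	int size = 3;
	unsigned long usage = 180;
	int current_threshold = -1;

	entries[size++] = 150;				/* the newly registered one */
	qsort(entries, size, sizeof(entries[0]), cmp_threshold);

	for (int i = 0; i < size && entries[i] <= usage; i++)
		current_threshold++;

	printf("sorted:");
	for (int i = 0; i < size; i++)
		printf(" %lu", entries[i]);
	printf("\ncurrent_threshold=%d\n", current_threshold);
	return 0;
}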
4199
4200 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4201 struct eventfd_ctx *eventfd, const char *args)
4202 {
4203 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4204 }
4205
4206 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4207 struct eventfd_ctx *eventfd, const char *args)
4208 {
4209 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4210 }
4211
4212 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4213 struct eventfd_ctx *eventfd, enum res_type type)
4214 {
4215 struct mem_cgroup_thresholds *thresholds;
4216 struct mem_cgroup_threshold_ary *new;
4217 unsigned long usage;
4218 int i, j, size, entries;
4219
4220 mutex_lock(&memcg->thresholds_lock);
4221
4222 if (type == _MEM) {
4223 thresholds = &memcg->thresholds;
4224 usage = mem_cgroup_usage(memcg, false);
4225 } else if (type == _MEMSWAP) {
4226 thresholds = &memcg->memsw_thresholds;
4227 usage = mem_cgroup_usage(memcg, true);
4228 } else
4229 BUG();
4230
4231 if (!thresholds->primary)
4232 goto unlock;
4233
4234
4235 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4236
4237
4238 size = entries = 0;
4239 for (i = 0; i < thresholds->primary->size; i++) {
4240 if (thresholds->primary->entries[i].eventfd != eventfd)
4241 size++;
4242 else
4243 entries++;
4244 }
4245
4246 new = thresholds->spare;
4247
4248
4249 if (!entries)
4250 goto unlock;
4251
4252
4253 if (!size) {
4254 kfree(new);
4255 new = NULL;
4256 goto swap_buffers;
4257 }
4258
4259 new->size = size;
4260
4261
4262 new->current_threshold = -1;
4263 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4264 if (thresholds->primary->entries[i].eventfd == eventfd)
4265 continue;
4266
4267 new->entries[j] = thresholds->primary->entries[i];
4268 if (new->entries[j].threshold <= usage) {
4269
4270
4271
4272
4273
4274 ++new->current_threshold;
4275 }
4276 j++;
4277 }
4278
4279 swap_buffers:
4280
4281 thresholds->spare = thresholds->primary;
4282
4283 rcu_assign_pointer(thresholds->primary, new);
4284
4285
4286 synchronize_rcu();
4287
4288
4289 if (!new) {
4290 kfree(thresholds->spare);
4291 thresholds->spare = NULL;
4292 }
4293 unlock:
4294 mutex_unlock(&memcg->thresholds_lock);
4295 }
4296
4297 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4298 struct eventfd_ctx *eventfd)
4299 {
4300 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4301 }
4302
4303 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4304 struct eventfd_ctx *eventfd)
4305 {
4306 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4307 }
4308
4309 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4310 struct eventfd_ctx *eventfd, const char *args)
4311 {
4312 struct mem_cgroup_eventfd_list *event;
4313
4314 event = kmalloc(sizeof(*event), GFP_KERNEL);
4315 if (!event)
4316 return -ENOMEM;
4317
4318 spin_lock(&memcg_oom_lock);
4319
4320 event->eventfd = eventfd;
4321 list_add(&event->list, &memcg->oom_notify);
4322
4323
4324 if (memcg->under_oom)
4325 eventfd_signal(eventfd, 1);
4326 spin_unlock(&memcg_oom_lock);
4327
4328 return 0;
4329 }
4330
4331 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4332 struct eventfd_ctx *eventfd)
4333 {
4334 struct mem_cgroup_eventfd_list *ev, *tmp;
4335
4336 spin_lock(&memcg_oom_lock);
4337
4338 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4339 if (ev->eventfd == eventfd) {
4340 list_del(&ev->list);
4341 kfree(ev);
4342 }
4343 }
4344
4345 spin_unlock(&memcg_oom_lock);
4346 }
4347
4348 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4349 {
4350 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4351
4352 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4353 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4354 seq_printf(sf, "oom_kill %lu\n",
4355 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4356 return 0;
4357 }
4358
4359 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4360 struct cftype *cft, u64 val)
4361 {
4362 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4363
4364
4365 if (!css->parent || !((val == 0) || (val == 1)))
4366 return -EINVAL;
4367
4368 memcg->oom_kill_disable = val;
4369 if (!val)
4370 memcg_oom_recover(memcg);
4371
4372 return 0;
4373 }
4374
4375 #ifdef CONFIG_CGROUP_WRITEBACK
4376
4377 #include <trace/events/writeback.h>
4378
4379 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4380 {
4381 return wb_domain_init(&memcg->cgwb_domain, gfp);
4382 }
4383
4384 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4385 {
4386 wb_domain_exit(&memcg->cgwb_domain);
4387 }
4388
4389 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4390 {
4391 wb_domain_size_changed(&memcg->cgwb_domain);
4392 }
4393
4394 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4395 {
4396 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4397
4398 if (!memcg->css.parent)
4399 return NULL;
4400
4401 return &memcg->cgwb_domain;
4402 }
4403
4404
4405
4406
4407
4408 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4409 {
4410 long x = atomic_long_read(&memcg->vmstats[idx]);
4411 int cpu;
4412
4413 for_each_online_cpu(cpu)
4414 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4415 if (x < 0)
4416 x = 0;
4417 return x;
4418 }
4419
4420 /**
4421 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4422 * @wb: bdi_writeback in question
4423 * @pfilepages: out parameter for number of file pages
4424 * @pheadroom: out parameter for number of allocatable pages according to memcg
4425 * @pdirty: out parameter for number of dirty pages
4426 * @pwriteback: out parameter for number of pages under writeback
4427 *
4428 * Determine the numbers of file, headroom, dirty and writeback pages in
4429 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
4430 * is the smallest "min(max, high) - used" found while walking from the
4431 * memcg up to, but not including, the root: how many more pages can be
4432 * allocated before some ancestor's limit or high watermark is hit.
4433 *
4434 * The dirty, writeback and file counts are taken from
4435 * memcg_exact_page_state(), which folds in the pending per-cpu deltas so
4436 * the numbers are precise at the cost of a per-cpu walk.
4437 */
4438 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4439 unsigned long *pheadroom, unsigned long *pdirty,
4440 unsigned long *pwriteback)
4441 {
4442 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4443 struct mem_cgroup *parent;
4444
4445 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4446
4447
4448 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4449 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4450 memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4451 *pheadroom = PAGE_COUNTER_MAX;
4452
4453 while ((parent = parent_mem_cgroup(memcg))) {
4454 unsigned long ceiling = min(memcg->memory.max, memcg->high);
4455 unsigned long used = page_counter_read(&memcg->memory);
4456
4457 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4458 memcg = parent;
4459 }
4460 }
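
The headroom reported above is the tightest "min(max, high) - used" along the ancestry, clamped at zero; the walk stops before the root, as in the loop above. A two-level arithmetic sketch with invented numbers:

#include <stdio.h>

#define NO_LIMIT	(~0UL)		/* sentinel playing PAGE_COUNTER_MAX */

struct level {
	unsigned long max;		/* hard limit, in pages */
	unsigned long high;		/* high watermark, in pages */
	unsigned long used;		/* current usage, in pages */
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* wb's own memcg first, then its (non-root) ancestors */
	struct level path[] = {
		{ .max = 10000, .high = 8000,     .used = 7000 },
		{ .max = 20000, .high = NO_LIMIT, .used = 19500 },
	};
	unsigned long headroom = NO_LIMIT;

	for (int i = 0; i < 2; i++) {
		unsigned long ceiling = min_ul(path[i].max, path[i].high);

		headroom = min_ul(headroom,
				  ceiling - min_ul(ceiling, path[i].used));
	}
	printf("headroom = %lu pages\n", headroom);	/* 500 here */
	return 0;
}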
4461
4462 /*
4463 * Foreign dirty flushing
4464 *
4465 * A page can be dirtied against a bdi_writeback that belongs to a memcg
4466 * other than the page's own, for example when an inode is shared between
4467 * cgroups.  Such "foreign" dirtying skews the per-wb accounting that
4468 * dirty throttling relies on.  To correct it, the offending bdi+memcg
4469 * pairs are remembered in a small per-memcg array (cgwb_frn[]); each slot
4470 * keeps a timestamp and a completion for an in-flight flush, and the
4471 * oldest idle slot is recycled when the array is full.
4472 *
4473 * mem_cgroup_flush_foreign() is called from the writeback path and kicks
4474 * asynchronous writeback (WB_REASON_FOREIGN_FLUSH) on every recorded pair
4475 * that is recent and not already being flushed, so the foreign dirtying
4476 * gets resolved.
4477 */
4506 void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4507 struct bdi_writeback *wb)
4508 {
4509 struct mem_cgroup *memcg = page->mem_cgroup;
4510 struct memcg_cgwb_frn *frn;
4511 u64 now = get_jiffies_64();
4512 u64 oldest_at = now;
4513 int oldest = -1;
4514 int i;
4515
4516 trace_track_foreign_dirty(page, wb);
4517
4518
4519
4520
4521
4522
4523 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4524 frn = &memcg->cgwb_frn[i];
4525 if (frn->bdi_id == wb->bdi->id &&
4526 frn->memcg_id == wb->memcg_css->id)
4527 break;
4528 if (time_before64(frn->at, oldest_at) &&
4529 atomic_read(&frn->done.cnt) == 1) {
4530 oldest = i;
4531 oldest_at = frn->at;
4532 }
4533 }
4534
4535 if (i < MEMCG_CGWB_FRN_CNT) {
4536
4537
4538
4539
4540
4541
4542
4543 unsigned long update_intv =
4544 min_t(unsigned long, HZ,
4545 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4546
4547 if (time_before64(frn->at, now - update_intv))
4548 frn->at = now;
4549 } else if (oldest >= 0) {
4550
4551 frn = &memcg->cgwb_frn[oldest];
4552 frn->bdi_id = wb->bdi->id;
4553 frn->memcg_id = wb->memcg_css->id;
4554 frn->at = now;
4555 }
4556 }
4557
4558
4559 void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4560 {
4561 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4562 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4563 u64 now = jiffies_64;
4564 int i;
4565
4566 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4567 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4568
4569
4570
4571
4572
4573
4574
4575 if (time_after64(frn->at, now - intv) &&
4576 atomic_read(&frn->done.cnt) == 1) {
4577 frn->at = 0;
4578 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4579 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4580 WB_REASON_FOREIGN_FLUSH,
4581 &frn->done);
4582 }
4583 }
4584 }
4585
4586 #else
4587
4588 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4589 {
4590 return 0;
4591 }
4592
4593 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4594 {
4595 }
4596
4597 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4598 {
4599 }
4600
4601 #endif
4602
4603 /*
4604 * DO NOT USE IN NEW FILES.
4605 *
4606 * "cgroup.event_control" implementation: the legacy (cgroup v1) interface
4607 * that lets userspace bind an eventfd to a memcg control file such as
4608 * memory.usage_in_bytes, memory.oom_control or memory.pressure_level.
4609 * Events are registered by writing "<event_fd> <control_fd> [args]" to
4610 * cgroup.event_control (see memcg_write_event_control() below) and are
4611 * torn down when the eventfd is closed or the cgroup goes away.
4612 */
4613
4614 /*
4615 * Unregister an event and free its resources.
4616 *
4617 * Runs from a workqueue; memcg_event_wake() schedules event->remove when
4618 * the eventfd is released.
4619 */
4620
4621 static void memcg_event_remove(struct work_struct *work)
4622 {
4623 struct mem_cgroup_event *event =
4624 container_of(work, struct mem_cgroup_event, remove);
4625 struct mem_cgroup *memcg = event->memcg;
4626
4627 remove_wait_queue(event->wqh, &event->wait);
4628
4629 event->unregister_event(memcg, event->eventfd);
4630
4631
4632 eventfd_signal(event->eventfd, 1);
4633
4634 eventfd_ctx_put(event->eventfd);
4635 kfree(event);
4636 css_put(&memcg->css);
4637 }
4638
4639
4640
4641
4642
4643
4644 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4645 int sync, void *key)
4646 {
4647 struct mem_cgroup_event *event =
4648 container_of(wait, struct mem_cgroup_event, wait);
4649 struct mem_cgroup *memcg = event->memcg;
4650 __poll_t flags = key_to_poll(key);
4651
4652 if (flags & EPOLLHUP) {
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662 spin_lock(&memcg->event_list_lock);
4663 if (!list_empty(&event->list)) {
4664 list_del_init(&event->list);
4665
4666
4667
4668
4669 schedule_work(&event->remove);
4670 }
4671 spin_unlock(&memcg->event_list_lock);
4672 }
4673
4674 return 0;
4675 }
4676
4677 static void memcg_event_ptable_queue_proc(struct file *file,
4678 wait_queue_head_t *wqh, poll_table *pt)
4679 {
4680 struct mem_cgroup_event *event =
4681 container_of(pt, struct mem_cgroup_event, pt);
4682
4683 event->wqh = wqh;
4684 add_wait_queue(wqh, &event->wait);
4685 }
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4696 char *buf, size_t nbytes, loff_t off)
4697 {
4698 struct cgroup_subsys_state *css = of_css(of);
4699 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4700 struct mem_cgroup_event *event;
4701 struct cgroup_subsys_state *cfile_css;
4702 unsigned int efd, cfd;
4703 struct fd efile;
4704 struct fd cfile;
4705 const char *name;
4706 char *endp;
4707 int ret;
4708
4709 buf = strstrip(buf);
4710
4711 efd = simple_strtoul(buf, &endp, 10);
4712 if (*endp != ' ')
4713 return -EINVAL;
4714 buf = endp + 1;
4715
4716 cfd = simple_strtoul(buf, &endp, 10);
4717 if ((*endp != ' ') && (*endp != '\0'))
4718 return -EINVAL;
4719 buf = endp + 1;
4720
4721 event = kzalloc(sizeof(*event), GFP_KERNEL);
4722 if (!event)
4723 return -ENOMEM;
4724
4725 event->memcg = memcg;
4726 INIT_LIST_HEAD(&event->list);
4727 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4728 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4729 INIT_WORK(&event->remove, memcg_event_remove);
4730
4731 efile = fdget(efd);
4732 if (!efile.file) {
4733 ret = -EBADF;
4734 goto out_kfree;
4735 }
4736
4737 event->eventfd = eventfd_ctx_fileget(efile.file);
4738 if (IS_ERR(event->eventfd)) {
4739 ret = PTR_ERR(event->eventfd);
4740 goto out_put_efile;
4741 }
4742
4743 cfile = fdget(cfd);
4744 if (!cfile.file) {
4745 ret = -EBADF;
4746 goto out_put_eventfd;
4747 }
4748
4749
4750
4751 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4752 if (ret < 0)
4753 goto out_put_cfile;
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763 name = cfile.file->f_path.dentry->d_name.name;
4764
4765 if (!strcmp(name, "memory.usage_in_bytes")) {
4766 event->register_event = mem_cgroup_usage_register_event;
4767 event->unregister_event = mem_cgroup_usage_unregister_event;
4768 } else if (!strcmp(name, "memory.oom_control")) {
4769 event->register_event = mem_cgroup_oom_register_event;
4770 event->unregister_event = mem_cgroup_oom_unregister_event;
4771 } else if (!strcmp(name, "memory.pressure_level")) {
4772 event->register_event = vmpressure_register_event;
4773 event->unregister_event = vmpressure_unregister_event;
4774 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4775 event->register_event = memsw_cgroup_usage_register_event;
4776 event->unregister_event = memsw_cgroup_usage_unregister_event;
4777 } else {
4778 ret = -EINVAL;
4779 goto out_put_cfile;
4780 }
4781
4782
4783
4784
4785
4786
4787 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4788 &memory_cgrp_subsys);
4789 ret = -EINVAL;
4790 if (IS_ERR(cfile_css))
4791 goto out_put_cfile;
4792 if (cfile_css != css) {
4793 css_put(cfile_css);
4794 goto out_put_cfile;
4795 }
4796
4797 ret = event->register_event(memcg, event->eventfd, buf);
4798 if (ret)
4799 goto out_put_css;
4800
4801 vfs_poll(efile.file, &event->pt);
4802
4803 spin_lock(&memcg->event_list_lock);
4804 list_add(&event->list, &memcg->event_list);
4805 spin_unlock(&memcg->event_list_lock);
4806
4807 fdput(cfile);
4808 fdput(efile);
4809
4810 return nbytes;
4811
4812 out_put_css:
4813 css_put(css);
4814 out_put_cfile:
4815 fdput(cfile);
4816 out_put_eventfd:
4817 eventfd_ctx_put(event->eventfd);
4818 out_put_efile:
4819 fdput(efile);
4820 out_kfree:
4821 kfree(event);
4822
4823 return ret;
4824 }
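
A user-space client of this interface creates an eventfd, opens the control file it wants to watch, and writes "<event_fd> <control_fd> [args]" to cgroup.event_control. The sketch below registers a usage threshold and waits for it to fire; the cgroup path and the 50M value are assumptions for the example:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	const char *dir = "/sys/fs/cgroup/memory/demo";
	char target[256], ctrl[256], line[64];
	uint64_t ticks;
	int efd, cfd, ctlfd, n;

	snprintf(target, sizeof(target), "%s/memory.usage_in_bytes", dir);
	snprintf(ctrl, sizeof(ctrl), "%s/cgroup.event_control", dir);

	efd = eventfd(0, 0);
	cfd = open(target, O_RDONLY);
	ctlfd = open(ctrl, O_WRONLY);
	if (efd < 0 || cfd < 0 || ctlfd < 0) {
		perror("setup");
		return 1;
	}

	/* "<event_fd> <control_fd> <args>": here a 50M usage threshold */
	n = snprintf(line, sizeof(line), "%d %d 52428800", efd, cfd);
	if (write(ctlfd, line, n) != n) {
		perror("cgroup.event_control");
		return 1;
	}

	if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("usage crossed the threshold\n");
	return 0;
}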
4825
4826 static struct cftype mem_cgroup_legacy_files[] = {
4827 {
4828 .name = "usage_in_bytes",
4829 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4830 .read_u64 = mem_cgroup_read_u64,
4831 },
4832 {
4833 .name = "max_usage_in_bytes",
4834 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4835 .write = mem_cgroup_reset,
4836 .read_u64 = mem_cgroup_read_u64,
4837 },
4838 {
4839 .name = "limit_in_bytes",
4840 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4841 .write = mem_cgroup_write,
4842 .read_u64 = mem_cgroup_read_u64,
4843 },
4844 {
4845 .name = "soft_limit_in_bytes",
4846 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4847 .write = mem_cgroup_write,
4848 .read_u64 = mem_cgroup_read_u64,
4849 },
4850 {
4851 .name = "failcnt",
4852 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4853 .write = mem_cgroup_reset,
4854 .read_u64 = mem_cgroup_read_u64,
4855 },
4856 {
4857 .name = "stat",
4858 .seq_show = memcg_stat_show,
4859 },
4860 {
4861 .name = "force_empty",
4862 .write = mem_cgroup_force_empty_write,
4863 },
4864 {
4865 .name = "use_hierarchy",
4866 .write_u64 = mem_cgroup_hierarchy_write,
4867 .read_u64 = mem_cgroup_hierarchy_read,
4868 },
4869 {
4870 .name = "cgroup.event_control",
4871 .write = memcg_write_event_control,
4872 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4873 },
4874 {
4875 .name = "swappiness",
4876 .read_u64 = mem_cgroup_swappiness_read,
4877 .write_u64 = mem_cgroup_swappiness_write,
4878 },
4879 {
4880 .name = "move_charge_at_immigrate",
4881 .read_u64 = mem_cgroup_move_charge_read,
4882 .write_u64 = mem_cgroup_move_charge_write,
4883 },
4884 {
4885 .name = "oom_control",
4886 .seq_show = mem_cgroup_oom_control_read,
4887 .write_u64 = mem_cgroup_oom_control_write,
4888 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4889 },
4890 {
4891 .name = "pressure_level",
4892 },
4893 #ifdef CONFIG_NUMA
4894 {
4895 .name = "numa_stat",
4896 .seq_show = memcg_numa_stat_show,
4897 },
4898 #endif
4899 {
4900 .name = "kmem.limit_in_bytes",
4901 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4902 .write = mem_cgroup_write,
4903 .read_u64 = mem_cgroup_read_u64,
4904 },
4905 {
4906 .name = "kmem.usage_in_bytes",
4907 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4908 .read_u64 = mem_cgroup_read_u64,
4909 },
4910 {
4911 .name = "kmem.failcnt",
4912 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4913 .write = mem_cgroup_reset,
4914 .read_u64 = mem_cgroup_read_u64,
4915 },
4916 {
4917 .name = "kmem.max_usage_in_bytes",
4918 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4919 .write = mem_cgroup_reset,
4920 .read_u64 = mem_cgroup_read_u64,
4921 },
4922 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4923 {
4924 .name = "kmem.slabinfo",
4925 .seq_start = memcg_slab_start,
4926 .seq_next = memcg_slab_next,
4927 .seq_stop = memcg_slab_stop,
4928 .seq_show = memcg_slab_show,
4929 },
4930 #endif
4931 {
4932 .name = "kmem.tcp.limit_in_bytes",
4933 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4934 .write = mem_cgroup_write,
4935 .read_u64 = mem_cgroup_read_u64,
4936 },
4937 {
4938 .name = "kmem.tcp.usage_in_bytes",
4939 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4940 .read_u64 = mem_cgroup_read_u64,
4941 },
4942 {
4943 .name = "kmem.tcp.failcnt",
4944 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4945 .write = mem_cgroup_reset,
4946 .read_u64 = mem_cgroup_read_u64,
4947 },
4948 {
4949 .name = "kmem.tcp.max_usage_in_bytes",
4950 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4951 .write = mem_cgroup_reset,
4952 .read_u64 = mem_cgroup_read_u64,
4953 },
4954 { },
4955 };
4956
4957 /*
4958 * Private memory cgroup IDR
4959 *
4960 * Swap records need to refer to memory cgroups through a very small field,
4961 * so memcg IDs are limited to MEM_CGROUP_ID_MAX (an unsigned short) and
4962 * mem_cgroup_from_id() maps an ID back to its memcg under RCU.
4963 *
4964 * The ID carries its own refcount (mem_cgroup_id_get_many() /
4965 * mem_cgroup_id_put_many()), separate from the css refcount: references
4966 * to an offline css that do not need the ID (page cache, slab objects)
4967 * therefore do not pin it, so zombie cgroups cannot exhaust the limited
4968 * ID space.
4969 */
4981 static DEFINE_IDR(mem_cgroup_idr);
4982
4983 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4984 {
4985 if (memcg->id.id > 0) {
4986 idr_remove(&mem_cgroup_idr, memcg->id.id);
4987 memcg->id.id = 0;
4988 }
4989 }
4990
4991 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4992 {
4993 refcount_add(n, &memcg->id.ref);
4994 }
4995
4996 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4997 {
4998 if (refcount_sub_and_test(n, &memcg->id.ref)) {
4999 mem_cgroup_id_remove(memcg);
5000
5001
5002 css_put(&memcg->css);
5003 }
5004 }
5005
5006 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
5007 {
5008 mem_cgroup_id_put_many(memcg, 1);
5009 }
5010
5011
5012
5013
5014
5015
5016
5017 struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
5018 {
5019 WARN_ON_ONCE(!rcu_read_lock_held());
5020 return idr_find(&mem_cgroup_idr, id);
5021 }
5022
5023 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5024 {
5025 struct mem_cgroup_per_node *pn;
5026 int tmp = node;
5027
5028
5029
5030
5031
5032
5033
5034
5035 if (!node_state(node, N_NORMAL_MEMORY))
5036 tmp = -1;
5037 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5038 if (!pn)
5039 return 1;
5040
5041 pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
5042 if (!pn->lruvec_stat_local) {
5043 kfree(pn);
5044 return 1;
5045 }
5046
5047 pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
5048 if (!pn->lruvec_stat_cpu) {
5049 free_percpu(pn->lruvec_stat_local);
5050 kfree(pn);
5051 return 1;
5052 }
5053
5054 lruvec_init(&pn->lruvec);
5055 pn->usage_in_excess = 0;
5056 pn->on_tree = false;
5057 pn->memcg = memcg;
5058
5059 memcg->nodeinfo[node] = pn;
5060 return 0;
5061 }
5062
5063 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5064 {
5065 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5066
5067 if (!pn)
5068 return;
5069
5070 free_percpu(pn->lruvec_stat_cpu);
5071 free_percpu(pn->lruvec_stat_local);
5072 kfree(pn);
5073 }
5074
5075 static void __mem_cgroup_free(struct mem_cgroup *memcg)
5076 {
5077 int node;
5078
5079 for_each_node(node)
5080 free_mem_cgroup_per_node_info(memcg, node);
5081 free_percpu(memcg->vmstats_percpu);
5082 free_percpu(memcg->vmstats_local);
5083 kfree(memcg);
5084 }
5085
5086 static void mem_cgroup_free(struct mem_cgroup *memcg)
5087 {
5088 memcg_wb_domain_exit(memcg);
5089
	/*
	 * Flush percpu vmstats and vmevents to guarantee the value
	 * correctness on parent's and all ancestor levels.
	 */
5093 memcg_flush_percpu_vmstats(memcg);
5094 memcg_flush_percpu_vmevents(memcg);
5095 __mem_cgroup_free(memcg);
5096 }
5097
5098 static struct mem_cgroup *mem_cgroup_alloc(void)
5099 {
5100 struct mem_cgroup *memcg;
5101 unsigned int size;
5102 int node;
5103 int __maybe_unused i;
5104 long error = -ENOMEM;
5105
5106 size = sizeof(struct mem_cgroup);
5107 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
5108
5109 memcg = kzalloc(size, GFP_KERNEL);
5110 if (!memcg)
5111 return ERR_PTR(error);
5112
5113 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5114 1, MEM_CGROUP_ID_MAX,
5115 GFP_KERNEL);
5116 if (memcg->id.id < 0) {
5117 error = memcg->id.id;
5118 goto fail;
5119 }
5120
5121 memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
5122 if (!memcg->vmstats_local)
5123 goto fail;
5124
5125 memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
5126 if (!memcg->vmstats_percpu)
5127 goto fail;
5128
5129 for_each_node(node)
5130 if (alloc_mem_cgroup_per_node_info(memcg, node))
5131 goto fail;
5132
5133 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5134 goto fail;
5135
5136 INIT_WORK(&memcg->high_work, high_work_func);
5137 memcg->last_scanned_node = MAX_NUMNODES;
5138 INIT_LIST_HEAD(&memcg->oom_notify);
5139 mutex_init(&memcg->thresholds_lock);
5140 spin_lock_init(&memcg->move_lock);
5141 vmpressure_init(&memcg->vmpressure);
5142 INIT_LIST_HEAD(&memcg->event_list);
5143 spin_lock_init(&memcg->event_list_lock);
5144 memcg->socket_pressure = jiffies;
5145 #ifdef CONFIG_MEMCG_KMEM
5146 memcg->kmemcg_id = -1;
5147 #endif
5148 #ifdef CONFIG_CGROUP_WRITEBACK
5149 INIT_LIST_HEAD(&memcg->cgwb_list);
5150 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5151 memcg->cgwb_frn[i].done =
5152 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5153 #endif
5154 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5155 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5156 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5157 memcg->deferred_split_queue.split_queue_len = 0;
5158 #endif
5159 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5160 return memcg;
5161 fail:
5162 mem_cgroup_id_remove(memcg);
5163 __mem_cgroup_free(memcg);
5164 return ERR_PTR(error);
5165 }
5166
5167 static struct cgroup_subsys_state * __ref
5168 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5169 {
5170 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5171 struct mem_cgroup *memcg;
5172 long error = -ENOMEM;
5173
5174 memcg = mem_cgroup_alloc();
5175 if (IS_ERR(memcg))
5176 return ERR_CAST(memcg);
5177
5178 memcg->high = PAGE_COUNTER_MAX;
5179 memcg->soft_limit = PAGE_COUNTER_MAX;
5180 if (parent) {
5181 memcg->swappiness = mem_cgroup_swappiness(parent);
5182 memcg->oom_kill_disable = parent->oom_kill_disable;
5183 }
5184 if (parent && parent->use_hierarchy) {
5185 memcg->use_hierarchy = true;
5186 page_counter_init(&memcg->memory, &parent->memory);
5187 page_counter_init(&memcg->swap, &parent->swap);
5188 page_counter_init(&memcg->memsw, &parent->memsw);
5189 page_counter_init(&memcg->kmem, &parent->kmem);
5190 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5191 } else {
5192 page_counter_init(&memcg->memory, NULL);
5193 page_counter_init(&memcg->swap, NULL);
5194 page_counter_init(&memcg->memsw, NULL);
5195 page_counter_init(&memcg->kmem, NULL);
5196 page_counter_init(&memcg->tcpmem, NULL);
5197
		/*
		 * Deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense, so let the cgroup subsystem know about this
		 * unfortunate state in our controller.
		 */
5202 if (parent != root_mem_cgroup)
5203 memory_cgrp_subsys.broken_hierarchy = true;
5204 }
5205
5206
5207 if (!parent) {
5208 #ifdef CONFIG_MEMCG_KMEM
5209 INIT_LIST_HEAD(&memcg->kmem_caches);
5210 #endif
5211 root_mem_cgroup = memcg;
5212 return &memcg->css;
5213 }
5214
5215 error = memcg_online_kmem(memcg);
5216 if (error)
5217 goto fail;
5218
5219 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5220 static_branch_inc(&memcg_sockets_enabled_key);
5221
5222 return &memcg->css;
5223 fail:
5224 mem_cgroup_id_remove(memcg);
5225 mem_cgroup_free(memcg);
5226 return ERR_PTR(error);
5227 }
5228
5229 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5230 {
5231 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5232
	/*
	 * A memcg must be visible for memcg_expand_shrinker_maps()
	 * by the time the maps are allocated. So, we allocate maps
	 * here, when for_each_mem_cgroup() can't skip it.
	 */
5238 if (memcg_alloc_shrinker_maps(memcg)) {
5239 mem_cgroup_id_remove(memcg);
5240 return -ENOMEM;
5241 }
5242
	/* Online state pins memcg ID, memcg ID pins CSS */
5244 refcount_set(&memcg->id.ref, 1);
5245 css_get(css);
5246 return 0;
5247 }
5248
5249 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5250 {
5251 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5252 struct mem_cgroup_event *event, *tmp;
5253
	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removal only after rmdir of the
	 * cgroup directory to avoid a race between userspace and kernelspace.
	 */
5259 spin_lock(&memcg->event_list_lock);
5260 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5261 list_del_init(&event->list);
5262 schedule_work(&event->remove);
5263 }
5264 spin_unlock(&memcg->event_list_lock);
5265
5266 page_counter_set_min(&memcg->memory, 0);
5267 page_counter_set_low(&memcg->memory, 0);
5268
5269 memcg_offline_kmem(memcg);
5270 wb_memcg_offline(memcg);
5271
5272 drain_all_stock(memcg);
5273
5274 mem_cgroup_id_put(memcg);
5275 }
5276
5277 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5278 {
5279 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5280
5281 invalidate_reclaim_iterators(memcg);
5282 }
5283
5284 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5285 {
5286 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5287 int __maybe_unused i;
5288
5289 #ifdef CONFIG_CGROUP_WRITEBACK
5290 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5291 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5292 #endif
5293 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5294 static_branch_dec(&memcg_sockets_enabled_key);
5295
5296 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5297 static_branch_dec(&memcg_sockets_enabled_key);
5298
5299 vmpressure_cleanup(&memcg->vmpressure);
5300 cancel_work_sync(&memcg->high_work);
5301 mem_cgroup_remove_from_trees(memcg);
5302 memcg_free_shrinker_maps(memcg);
5303 memcg_free_kmem(memcg);
5304 mem_cgroup_free(memcg);
5305 }
5306
/*
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
5320 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5321 {
5322 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5323
5324 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5325 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5326 page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
5327 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5328 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5329 page_counter_set_min(&memcg->memory, 0);
5330 page_counter_set_low(&memcg->memory, 0);
5331 memcg->high = PAGE_COUNTER_MAX;
5332 memcg->soft_limit = PAGE_COUNTER_MAX;
5333 memcg_wb_domain_size_changed(memcg);
5334 }
5335
5336 #ifdef CONFIG_MMU
5337
5338 static int mem_cgroup_do_precharge(unsigned long count)
5339 {
5340 int ret;
5341
5342
5343 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5344 if (!ret) {
5345 mc.precharge += count;
5346 return ret;
5347 }
5348
5349
5350 while (count--) {
5351 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5352 if (ret)
5353 return ret;
5354 mc.precharge++;
5355 cond_resched();
5356 }
5357 return 0;
5358 }
5359
5360 union mc_target {
5361 struct page *page;
5362 swp_entry_t ent;
5363 };
5364
5365 enum mc_target_type {
5366 MC_TARGET_NONE = 0,
5367 MC_TARGET_PAGE,
5368 MC_TARGET_SWAP,
5369 MC_TARGET_DEVICE,
5370 };
5371
5372 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5373 unsigned long addr, pte_t ptent)
5374 {
5375 struct page *page = vm_normal_page(vma, addr, ptent);
5376
5377 if (!page || !page_mapped(page))
5378 return NULL;
5379 if (PageAnon(page)) {
5380 if (!(mc.flags & MOVE_ANON))
5381 return NULL;
5382 } else {
5383 if (!(mc.flags & MOVE_FILE))
5384 return NULL;
5385 }
5386 if (!get_page_unless_zero(page))
5387 return NULL;
5388
5389 return page;
5390 }
5391
5392 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5393 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5394 pte_t ptent, swp_entry_t *entry)
5395 {
5396 struct page *page = NULL;
5397 swp_entry_t ent = pte_to_swp_entry(ptent);
5398
5399 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
5400 return NULL;
5401
5402
5403
5404
5405
5406
5407 if (is_device_private_entry(ent)) {
5408 page = device_private_entry_to_page(ent);
5409
5410
5411
5412
5413 if (!page_ref_add_unless(page, 1, 1))
5414 return NULL;
5415 return page;
5416 }
5417
5418
5419
5420
5421
5422 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5423 if (do_memsw_account())
5424 entry->val = ent.val;
5425
5426 return page;
5427 }
5428 #else
5429 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5430 pte_t ptent, swp_entry_t *entry)
5431 {
5432 return NULL;
5433 }
5434 #endif
5435
5436 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5437 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5438 {
5439 struct page *page = NULL;
5440 struct address_space *mapping;
5441 pgoff_t pgoff;
5442
5443 if (!vma->vm_file)
5444 return NULL;
5445 if (!(mc.flags & MOVE_FILE))
5446 return NULL;
5447
5448 mapping = vma->vm_file->f_mapping;
5449 pgoff = linear_page_index(vma, addr);
5450
5451
5452 #ifdef CONFIG_SWAP
5453
5454 if (shmem_mapping(mapping)) {
5455 page = find_get_entry(mapping, pgoff);
5456 if (xa_is_value(page)) {
5457 swp_entry_t swp = radix_to_swp_entry(page);
5458 if (do_memsw_account())
5459 *entry = swp;
5460 page = find_get_page(swap_address_space(swp),
5461 swp_offset(swp));
5462 }
5463 } else
5464 page = find_get_page(mapping, pgoff);
5465 #else
5466 page = find_get_page(mapping, pgoff);
5467 #endif
5468 return page;
5469 }
5470
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from
 * @to: mem_cgroup which the page is moved to
 *
 * The caller must make sure the page is not on LRU (isolate_lru_page()
 * is useful.)
 *
 * This function doesn't do "charge" to the new cgroup and doesn't do
 * "uncharge" from the old cgroup.
 */
5483 static int mem_cgroup_move_account(struct page *page,
5484 bool compound,
5485 struct mem_cgroup *from,
5486 struct mem_cgroup *to)
5487 {
5488 struct lruvec *from_vec, *to_vec;
5489 struct pglist_data *pgdat;
5490 unsigned long flags;
5491 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5492 int ret;
5493 bool anon;
5494
5495 VM_BUG_ON(from == to);
5496 VM_BUG_ON_PAGE(PageLRU(page), page);
5497 VM_BUG_ON(compound && !PageTransHuge(page));
5498
5499
5500
5501
5502
5503 ret = -EBUSY;
5504 if (!trylock_page(page))
5505 goto out;
5506
5507 ret = -EINVAL;
5508 if (page->mem_cgroup != from)
5509 goto out_unlock;
5510
5511 anon = PageAnon(page);
5512
5513 pgdat = page_pgdat(page);
5514 from_vec = mem_cgroup_lruvec(pgdat, from);
5515 to_vec = mem_cgroup_lruvec(pgdat, to);
5516
5517 spin_lock_irqsave(&from->move_lock, flags);
5518
5519 if (!anon && page_mapped(page)) {
5520 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5521 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5522 }
5523
5524
5525
5526
5527
5528
5529 if (!anon && PageDirty(page)) {
5530 struct address_space *mapping = page_mapping(page);
5531
5532 if (mapping_cap_account_dirty(mapping)) {
5533 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
5534 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
5535 }
5536 }
5537
5538 if (PageWriteback(page)) {
5539 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5540 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5541 }
5542
5543
5544
5545
5546
5547
5548
5549
5550 page->mem_cgroup = to;
5551
5552 spin_unlock_irqrestore(&from->move_lock, flags);
5553
5554 ret = 0;
5555
5556 local_irq_disable();
5557 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5558 memcg_check_events(to, page);
5559 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5560 memcg_check_events(from, page);
5561 local_irq_enable();
5562 out_unlock:
5563 unlock_page(page);
5564 out:
5565 return ret;
5566 }
5567
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry is stored (can be NULL)
 *
 * Returns:
 *   MC_TARGET_NONE: the pte is not a target for move charge.
 *   MC_TARGET_PAGE: the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in
 *     target->page with an extra refcount taken (callers must handle it).
 *   MC_TARGET_SWAP: the swap entry corresponding to this pte is a target
 *     for charge migration. If @target is not NULL, the entry is stored
 *     in target->ent.
 *   MC_TARGET_DEVICE: like MC_TARGET_PAGE, but the page is
 *     MEMORY_DEVICE_PRIVATE (a ZONE_DEVICE page, thus not on the LRU).
 *
 * Called with pte lock held.
 */
5594 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5595 unsigned long addr, pte_t ptent, union mc_target *target)
5596 {
5597 struct page *page = NULL;
5598 enum mc_target_type ret = MC_TARGET_NONE;
5599 swp_entry_t ent = { .val = 0 };
5600
5601 if (pte_present(ptent))
5602 page = mc_handle_present_pte(vma, addr, ptent);
5603 else if (is_swap_pte(ptent))
5604 page = mc_handle_swap_pte(vma, ptent, &ent);
5605 else if (pte_none(ptent))
5606 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5607
5608 if (!page && !ent.val)
5609 return ret;
5610 if (page) {
5611
5612
5613
5614
5615
5616 if (page->mem_cgroup == mc.from) {
5617 ret = MC_TARGET_PAGE;
5618 if (is_device_private_page(page))
5619 ret = MC_TARGET_DEVICE;
5620 if (target)
5621 target->page = page;
5622 }
5623 if (!ret || !target)
5624 put_page(page);
5625 }
5626
5627
5628
5629
5630 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5631 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5632 ret = MC_TARGET_SWAP;
5633 if (target)
5634 target->ent = ent;
5635 }
5636 return ret;
5637 }
5638
5639 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5640
5641
5642
5643
5644
5645 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5646 unsigned long addr, pmd_t pmd, union mc_target *target)
5647 {
5648 struct page *page = NULL;
5649 enum mc_target_type ret = MC_TARGET_NONE;
5650
5651 if (unlikely(is_swap_pmd(pmd))) {
5652 VM_BUG_ON(thp_migration_supported() &&
5653 !is_pmd_migration_entry(pmd));
5654 return ret;
5655 }
5656 page = pmd_page(pmd);
5657 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5658 if (!(mc.flags & MOVE_ANON))
5659 return ret;
5660 if (page->mem_cgroup == mc.from) {
5661 ret = MC_TARGET_PAGE;
5662 if (target) {
5663 get_page(page);
5664 target->page = page;
5665 }
5666 }
5667 return ret;
5668 }
5669 #else
5670 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5671 unsigned long addr, pmd_t pmd, union mc_target *target)
5672 {
5673 return MC_TARGET_NONE;
5674 }
5675 #endif
5676
5677 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5678 unsigned long addr, unsigned long end,
5679 struct mm_walk *walk)
5680 {
5681 struct vm_area_struct *vma = walk->vma;
5682 pte_t *pte;
5683 spinlock_t *ptl;
5684
5685 ptl = pmd_trans_huge_lock(pmd, vma);
5686 if (ptl) {
5687
5688
5689
5690
5691
5692 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5693 mc.precharge += HPAGE_PMD_NR;
5694 spin_unlock(ptl);
5695 return 0;
5696 }
5697
5698 if (pmd_trans_unstable(pmd))
5699 return 0;
5700 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5701 for (; addr != end; pte++, addr += PAGE_SIZE)
5702 if (get_mctgt_type(vma, addr, *pte, NULL))
5703 mc.precharge++;
5704 pte_unmap_unlock(pte - 1, ptl);
5705 cond_resched();
5706
5707 return 0;
5708 }
5709
5710 static const struct mm_walk_ops precharge_walk_ops = {
5711 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5712 };
5713
5714 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5715 {
5716 unsigned long precharge;
5717
5718 down_read(&mm->mmap_sem);
5719 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5720 up_read(&mm->mmap_sem);
5721
5722 precharge = mc.precharge;
5723 mc.precharge = 0;
5724
5725 return precharge;
5726 }
5727
5728 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5729 {
5730 unsigned long precharge = mem_cgroup_count_precharge(mm);
5731
5732 VM_BUG_ON(mc.moving_task);
5733 mc.moving_task = current;
5734 return mem_cgroup_do_precharge(precharge);
5735 }
5736
5737
5738 static void __mem_cgroup_clear_mc(void)
5739 {
5740 struct mem_cgroup *from = mc.from;
5741 struct mem_cgroup *to = mc.to;
5742
5743
5744 if (mc.precharge) {
5745 cancel_charge(mc.to, mc.precharge);
5746 mc.precharge = 0;
5747 }
5748
	/*
	 * we didn't uncharge from mc.to at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
5752 if (mc.moved_charge) {
5753 cancel_charge(mc.from, mc.moved_charge);
5754 mc.moved_charge = 0;
5755 }
5756
5757 if (mc.moved_swap) {
5758
5759 if (!mem_cgroup_is_root(mc.from))
5760 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5761
5762 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5763
5764
5765
5766
5767
5768 if (!mem_cgroup_is_root(mc.to))
5769 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5770
5771 mem_cgroup_id_get_many(mc.to, mc.moved_swap);
5772 css_put_many(&mc.to->css, mc.moved_swap);
5773
5774 mc.moved_swap = 0;
5775 }
5776 memcg_oom_recover(from);
5777 memcg_oom_recover(to);
5778 wake_up_all(&mc.waitq);
5779 }
5780
5781 static void mem_cgroup_clear_mc(void)
5782 {
5783 struct mm_struct *mm = mc.mm;
5784
5785
5786
5787
5788
5789 mc.moving_task = NULL;
5790 __mem_cgroup_clear_mc();
5791 spin_lock(&mc.lock);
5792 mc.from = NULL;
5793 mc.to = NULL;
5794 mc.mm = NULL;
5795 spin_unlock(&mc.lock);
5796
5797 mmput(mm);
5798 }
5799
5800 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5801 {
5802 struct cgroup_subsys_state *css;
5803 struct mem_cgroup *memcg = NULL;
5804 struct mem_cgroup *from;
5805 struct task_struct *leader, *p;
5806 struct mm_struct *mm;
5807 unsigned long move_flags;
5808 int ret = 0;
5809
5810
5811 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5812 return 0;
5813
5814
5815
5816
5817
5818
5819
5820 p = NULL;
5821 cgroup_taskset_for_each_leader(leader, css, tset) {
5822 WARN_ON_ONCE(p);
5823 p = leader;
5824 memcg = mem_cgroup_from_css(css);
5825 }
5826 if (!p)
5827 return 0;
5828
5829
5830
5831
5832
5833
5834 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5835 if (!move_flags)
5836 return 0;
5837
5838 from = mem_cgroup_from_task(p);
5839
5840 VM_BUG_ON(from == memcg);
5841
5842 mm = get_task_mm(p);
5843 if (!mm)
5844 return 0;
5845
5846 if (mm->owner == p) {
5847 VM_BUG_ON(mc.from);
5848 VM_BUG_ON(mc.to);
5849 VM_BUG_ON(mc.precharge);
5850 VM_BUG_ON(mc.moved_charge);
5851 VM_BUG_ON(mc.moved_swap);
5852
5853 spin_lock(&mc.lock);
5854 mc.mm = mm;
5855 mc.from = from;
5856 mc.to = memcg;
5857 mc.flags = move_flags;
5858 spin_unlock(&mc.lock);
5859
5860
5861 ret = mem_cgroup_precharge_mc(mm);
5862 if (ret)
5863 mem_cgroup_clear_mc();
5864 } else {
5865 mmput(mm);
5866 }
5867 return ret;
5868 }
5869
5870 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5871 {
5872 if (mc.to)
5873 mem_cgroup_clear_mc();
5874 }
5875
5876 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5877 unsigned long addr, unsigned long end,
5878 struct mm_walk *walk)
5879 {
5880 int ret = 0;
5881 struct vm_area_struct *vma = walk->vma;
5882 pte_t *pte;
5883 spinlock_t *ptl;
5884 enum mc_target_type target_type;
5885 union mc_target target;
5886 struct page *page;
5887
5888 ptl = pmd_trans_huge_lock(pmd, vma);
5889 if (ptl) {
5890 if (mc.precharge < HPAGE_PMD_NR) {
5891 spin_unlock(ptl);
5892 return 0;
5893 }
5894 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5895 if (target_type == MC_TARGET_PAGE) {
5896 page = target.page;
5897 if (!isolate_lru_page(page)) {
5898 if (!mem_cgroup_move_account(page, true,
5899 mc.from, mc.to)) {
5900 mc.precharge -= HPAGE_PMD_NR;
5901 mc.moved_charge += HPAGE_PMD_NR;
5902 }
5903 putback_lru_page(page);
5904 }
5905 put_page(page);
5906 } else if (target_type == MC_TARGET_DEVICE) {
5907 page = target.page;
5908 if (!mem_cgroup_move_account(page, true,
5909 mc.from, mc.to)) {
5910 mc.precharge -= HPAGE_PMD_NR;
5911 mc.moved_charge += HPAGE_PMD_NR;
5912 }
5913 put_page(page);
5914 }
5915 spin_unlock(ptl);
5916 return 0;
5917 }
5918
5919 if (pmd_trans_unstable(pmd))
5920 return 0;
5921 retry:
5922 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5923 for (; addr != end; addr += PAGE_SIZE) {
5924 pte_t ptent = *(pte++);
5925 bool device = false;
5926 swp_entry_t ent;
5927
5928 if (!mc.precharge)
5929 break;
5930
5931 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5932 case MC_TARGET_DEVICE:
5933 device = true;
			/* fall through */
5935 case MC_TARGET_PAGE:
5936 page = target.page;
5937
5938
5939
5940
5941
5942
5943 if (PageTransCompound(page))
5944 goto put;
5945 if (!device && isolate_lru_page(page))
5946 goto put;
5947 if (!mem_cgroup_move_account(page, false,
5948 mc.from, mc.to)) {
5949 mc.precharge--;
5950
5951 mc.moved_charge++;
5952 }
5953 if (!device)
5954 putback_lru_page(page);
5955 put:
5956 put_page(page);
5957 break;
5958 case MC_TARGET_SWAP:
5959 ent = target.ent;
5960 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5961 mc.precharge--;
5962
5963 mc.moved_swap++;
5964 }
5965 break;
5966 default:
5967 break;
5968 }
5969 }
5970 pte_unmap_unlock(pte - 1, ptl);
5971 cond_resched();
5972
5973 if (addr != end) {
5974
5975
5976
5977
5978
5979
5980 ret = mem_cgroup_do_precharge(1);
5981 if (!ret)
5982 goto retry;
5983 }
5984
5985 return ret;
5986 }
5987
5988 static const struct mm_walk_ops charge_walk_ops = {
5989 .pmd_entry = mem_cgroup_move_charge_pte_range,
5990 };
5991
5992 static void mem_cgroup_move_charge(void)
5993 {
5994 lru_add_drain_all();
5995
	/*
	 * Signal lock_page_memcg() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
6000 atomic_inc(&mc.from->moving_account);
6001 synchronize_rcu();
6002 retry:
6003 if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
6004
6005
6006
6007
6008
6009
6010
6011 __mem_cgroup_clear_mc();
6012 cond_resched();
6013 goto retry;
6014 }
6015
6016
6017
6018
6019 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6020 NULL);
6021
6022 up_read(&mc.mm->mmap_sem);
6023 atomic_dec(&mc.from->moving_account);
6024 }
6025
6026 static void mem_cgroup_move_task(void)
6027 {
6028 if (mc.to) {
6029 mem_cgroup_move_charge();
6030 mem_cgroup_clear_mc();
6031 }
6032 }
6033 #else
6034 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6035 {
6036 return 0;
6037 }
6038 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6039 {
6040 }
6041 static void mem_cgroup_move_task(void)
6042 {
6043 }
6044 #endif
6045
6046
6047
6048
6049
6050
6051 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6052 {
	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
6058 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6059 root_mem_cgroup->use_hierarchy = true;
6060 else
6061 root_mem_cgroup->use_hierarchy = false;
6062 }
6063
6064 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6065 {
6066 if (value == PAGE_COUNTER_MAX)
6067 seq_puts(m, "max\n");
6068 else
6069 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6070
6071 return 0;
6072 }
6073
6074 static u64 memory_current_read(struct cgroup_subsys_state *css,
6075 struct cftype *cft)
6076 {
6077 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6078
6079 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6080 }
6081
6082 static int memory_min_show(struct seq_file *m, void *v)
6083 {
6084 return seq_puts_memcg_tunable(m,
6085 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6086 }
6087
6088 static ssize_t memory_min_write(struct kernfs_open_file *of,
6089 char *buf, size_t nbytes, loff_t off)
6090 {
6091 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6092 unsigned long min;
6093 int err;
6094
6095 buf = strstrip(buf);
6096 err = page_counter_memparse(buf, "max", &min);
6097 if (err)
6098 return err;
6099
6100 page_counter_set_min(&memcg->memory, min);
6101
6102 return nbytes;
6103 }
6104
6105 static int memory_low_show(struct seq_file *m, void *v)
6106 {
6107 return seq_puts_memcg_tunable(m,
6108 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6109 }
6110
6111 static ssize_t memory_low_write(struct kernfs_open_file *of,
6112 char *buf, size_t nbytes, loff_t off)
6113 {
6114 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6115 unsigned long low;
6116 int err;
6117
6118 buf = strstrip(buf);
6119 err = page_counter_memparse(buf, "max", &low);
6120 if (err)
6121 return err;
6122
6123 page_counter_set_low(&memcg->memory, low);
6124
6125 return nbytes;
6126 }
6127
6128 static int memory_high_show(struct seq_file *m, void *v)
6129 {
6130 return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
6131 }
6132
6133 static ssize_t memory_high_write(struct kernfs_open_file *of,
6134 char *buf, size_t nbytes, loff_t off)
6135 {
6136 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6137 unsigned long nr_pages;
6138 unsigned long high;
6139 int err;
6140
6141 buf = strstrip(buf);
6142 err = page_counter_memparse(buf, "max", &high);
6143 if (err)
6144 return err;
6145
6146 memcg->high = high;
6147
6148 nr_pages = page_counter_read(&memcg->memory);
6149 if (nr_pages > high)
6150 try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6151 GFP_KERNEL, true);
6152
6153 memcg_wb_domain_size_changed(memcg);
6154 return nbytes;
6155 }
6156
6157 static int memory_max_show(struct seq_file *m, void *v)
6158 {
6159 return seq_puts_memcg_tunable(m,
6160 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6161 }
6162
6163 static ssize_t memory_max_write(struct kernfs_open_file *of,
6164 char *buf, size_t nbytes, loff_t off)
6165 {
6166 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6167 unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6168 bool drained = false;
6169 unsigned long max;
6170 int err;
6171
6172 buf = strstrip(buf);
6173 err = page_counter_memparse(buf, "max", &max);
6174 if (err)
6175 return err;
6176
6177 xchg(&memcg->memory.max, max);
6178
6179 for (;;) {
6180 unsigned long nr_pages = page_counter_read(&memcg->memory);
6181
6182 if (nr_pages <= max)
6183 break;
6184
6185 if (signal_pending(current)) {
6186 err = -EINTR;
6187 break;
6188 }
6189
6190 if (!drained) {
6191 drain_all_stock(memcg);
6192 drained = true;
6193 continue;
6194 }
6195
6196 if (nr_reclaims) {
6197 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6198 GFP_KERNEL, true))
6199 nr_reclaims--;
6200 continue;
6201 }
6202
6203 memcg_memory_event(memcg, MEMCG_OOM);
6204 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6205 break;
6206 }
6207
6208 memcg_wb_domain_size_changed(memcg);
6209 return nbytes;
6210 }
6211
6212 static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6213 {
6214 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6215 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6216 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6217 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6218 seq_printf(m, "oom_kill %lu\n",
6219 atomic_long_read(&events[MEMCG_OOM_KILL]));
6220 }
6221
6222 static int memory_events_show(struct seq_file *m, void *v)
6223 {
6224 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6225
6226 __memory_events_show(m, memcg->memory_events);
6227 return 0;
6228 }
6229
6230 static int memory_events_local_show(struct seq_file *m, void *v)
6231 {
6232 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6233
6234 __memory_events_show(m, memcg->memory_events_local);
6235 return 0;
6236 }
6237
6238 static int memory_stat_show(struct seq_file *m, void *v)
6239 {
6240 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6241 char *buf;
6242
6243 buf = memory_stat_format(memcg);
6244 if (!buf)
6245 return -ENOMEM;
6246 seq_puts(m, buf);
6247 kfree(buf);
6248 return 0;
6249 }
6250
6251 static int memory_oom_group_show(struct seq_file *m, void *v)
6252 {
6253 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6254
6255 seq_printf(m, "%d\n", memcg->oom_group);
6256
6257 return 0;
6258 }
6259
6260 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6261 char *buf, size_t nbytes, loff_t off)
6262 {
6263 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6264 int ret, oom_group;
6265
6266 buf = strstrip(buf);
6267 if (!buf)
6268 return -EINVAL;
6269
6270 ret = kstrtoint(buf, 0, &oom_group);
6271 if (ret)
6272 return ret;
6273
6274 if (oom_group != 0 && oom_group != 1)
6275 return -EINVAL;
6276
6277 memcg->oom_group = oom_group;
6278
6279 return nbytes;
6280 }
6281
6282 static struct cftype memory_files[] = {
6283 {
6284 .name = "current",
6285 .flags = CFTYPE_NOT_ON_ROOT,
6286 .read_u64 = memory_current_read,
6287 },
6288 {
6289 .name = "min",
6290 .flags = CFTYPE_NOT_ON_ROOT,
6291 .seq_show = memory_min_show,
6292 .write = memory_min_write,
6293 },
6294 {
6295 .name = "low",
6296 .flags = CFTYPE_NOT_ON_ROOT,
6297 .seq_show = memory_low_show,
6298 .write = memory_low_write,
6299 },
6300 {
6301 .name = "high",
6302 .flags = CFTYPE_NOT_ON_ROOT,
6303 .seq_show = memory_high_show,
6304 .write = memory_high_write,
6305 },
6306 {
6307 .name = "max",
6308 .flags = CFTYPE_NOT_ON_ROOT,
6309 .seq_show = memory_max_show,
6310 .write = memory_max_write,
6311 },
6312 {
6313 .name = "events",
6314 .flags = CFTYPE_NOT_ON_ROOT,
6315 .file_offset = offsetof(struct mem_cgroup, events_file),
6316 .seq_show = memory_events_show,
6317 },
6318 {
6319 .name = "events.local",
6320 .flags = CFTYPE_NOT_ON_ROOT,
6321 .file_offset = offsetof(struct mem_cgroup, events_local_file),
6322 .seq_show = memory_events_local_show,
6323 },
6324 {
6325 .name = "stat",
6326 .flags = CFTYPE_NOT_ON_ROOT,
6327 .seq_show = memory_stat_show,
6328 },
6329 {
6330 .name = "oom.group",
6331 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6332 .seq_show = memory_oom_group_show,
6333 .write = memory_oom_group_write,
6334 },
6335 { }
6336 };
6337
6338 struct cgroup_subsys memory_cgrp_subsys = {
6339 .css_alloc = mem_cgroup_css_alloc,
6340 .css_online = mem_cgroup_css_online,
6341 .css_offline = mem_cgroup_css_offline,
6342 .css_released = mem_cgroup_css_released,
6343 .css_free = mem_cgroup_css_free,
6344 .css_reset = mem_cgroup_css_reset,
6345 .can_attach = mem_cgroup_can_attach,
6346 .cancel_attach = mem_cgroup_cancel_attach,
6347 .post_attach = mem_cgroup_move_task,
6348 .bind = mem_cgroup_bind,
6349 .dfl_cftypes = memory_files,
6350 .legacy_cftypes = mem_cgroup_legacy_files,
6351 .early_init = 0,
6352 };
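
memory_files above is the cgroup v2 interface registered through dfl_cftypes: writes to memory.max land in memory_max_write(), and memory.events is rendered by __memory_events_show(). As a rough sketch of how these files behave from user space, the following assumes a v2 hierarchy mounted at /sys/fs/cgroup with an existing child group named "demo" (both assumptions): it caps the group at 100 MiB and then dumps the event counters.

/* Illustrative only: set memory.max and read back memory.events. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/demo";	/* assumed path */
	char path[256], buf[512];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "%s/memory.max", grp);
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open memory.max");
		return 1;
	}
	/* memory_max_write() accepts a byte count or the literal "max" */
	if (write(fd, "104857600", strlen("104857600")) < 0)
		perror("write memory.max");
	close(fd);

	snprintf(path, sizeof(path), "%s/memory.events", grp);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open memory.events");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);	/* low/high/max/oom/oom_kill counters */
	}
	close(fd);
	return 0;
}
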
6353
6354
/**
 * mem_cgroup_protected - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries: it
 *          caches the effective protection in memcg->memory.emin/elow,
 *          which the children's calculation relies on.
 *
 * Returns one of the following:
 *   MEMCG_PROT_MIN: cgroup memory is fully protected
 *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is an
 *     unprotected supply of reclaimable memory from other cgroups
 *   MEMCG_PROT_NONE: cgroup memory is not protected
 *
 * @root is exclusive; it is never protected when looked at directly.
 *
 * To provide proper hierarchical behavior, effective memory.min/low values
 * are used: the effective value is capped by the parent's effective value,
 * and when the siblings' protections overcommit the parent's, each child's
 * share is scaled down in proportion to its protected usage:
 *
 *                                              low_usage
 *   elow = min(memory.low, parent->elow * --------------------)
 *                                          siblings_low_usage
 *
 * where low_usage = min(usage, memory.low) and siblings_low_usage is the
 * sum of low_usage over the parent's children (children_low_usage).
 * Effective memory.min is calculated the same way from the min counters.
 */
6424 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
6425 struct mem_cgroup *memcg)
6426 {
6427 struct mem_cgroup *parent;
6428 unsigned long emin, parent_emin;
6429 unsigned long elow, parent_elow;
6430 unsigned long usage;
6431
6432 if (mem_cgroup_disabled())
6433 return MEMCG_PROT_NONE;
6434
6435 if (!root)
6436 root = root_mem_cgroup;
6437 if (memcg == root)
6438 return MEMCG_PROT_NONE;
6439
6440 usage = page_counter_read(&memcg->memory);
6441 if (!usage)
6442 return MEMCG_PROT_NONE;
6443
6444 emin = memcg->memory.min;
6445 elow = memcg->memory.low;
6446
6447 parent = parent_mem_cgroup(memcg);
6448
6449 if (!parent)
6450 return MEMCG_PROT_NONE;
6451
6452 if (parent == root)
6453 goto exit;
6454
6455 parent_emin = READ_ONCE(parent->memory.emin);
6456 emin = min(emin, parent_emin);
6457 if (emin && parent_emin) {
6458 unsigned long min_usage, siblings_min_usage;
6459
6460 min_usage = min(usage, memcg->memory.min);
6461 siblings_min_usage = atomic_long_read(
6462 &parent->memory.children_min_usage);
6463
6464 if (min_usage && siblings_min_usage)
6465 emin = min(emin, parent_emin * min_usage /
6466 siblings_min_usage);
6467 }
6468
6469 parent_elow = READ_ONCE(parent->memory.elow);
6470 elow = min(elow, parent_elow);
6471 if (elow && parent_elow) {
6472 unsigned long low_usage, siblings_low_usage;
6473
6474 low_usage = min(usage, memcg->memory.low);
6475 siblings_low_usage = atomic_long_read(
6476 &parent->memory.children_low_usage);
6477
6478 if (low_usage && siblings_low_usage)
6479 elow = min(elow, parent_elow * low_usage /
6480 siblings_low_usage);
6481 }
6482
6483 exit:
6484 memcg->memory.emin = emin;
6485 memcg->memory.elow = elow;
6486
6487 if (usage <= emin)
6488 return MEMCG_PROT_MIN;
6489 else if (usage <= elow)
6490 return MEMCG_PROT_LOW;
6491 else
6492 return MEMCG_PROT_NONE;
6493 }
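
The proportional clamping done in mem_cgroup_protected() can be illustrated with made-up numbers. The toy program below mirrors the min()/scaling arithmetic for memory.low only; every value in it is arbitrary and purely illustrative, not taken from the kernel.

/* Illustrative only: effective memory.low under sibling overcommit. */
#include <stdio.h>

int main(void)
{
	unsigned long parent_elow = 512;	/* pages of protection to distribute */
	unsigned long low[2]   = {400, 400};	/* each child asks for 400 pages */
	unsigned long usage[2] = {300, 100};	/* actual consumption */
	unsigned long siblings_low_usage = 0;
	unsigned long low_usage, elow, scaled;
	int i;

	/* children_low_usage: sum of min(usage, low) over the siblings */
	for (i = 0; i < 2; i++)
		siblings_low_usage += usage[i] < low[i] ? usage[i] : low[i];

	for (i = 0; i < 2; i++) {
		low_usage = usage[i] < low[i] ? usage[i] : low[i];
		elow = low[i] < parent_elow ? low[i] : parent_elow;
		if (low_usage && siblings_low_usage) {
			scaled = parent_elow * low_usage / siblings_low_usage;
			if (scaled < elow)
				elow = scaled;
		}
		printf("child %d: effective low = %lu pages\n", i, elow);
	}
	return 0;
}
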
6494
/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 * @compound: charge the page as compound or small page
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge(), or abort the transaction
 * with mem_cgroup_cancel_charge() in case page instantiation fails.
 */
6513 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6514 gfp_t gfp_mask, struct mem_cgroup **memcgp,
6515 bool compound)
6516 {
6517 struct mem_cgroup *memcg = NULL;
6518 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6519 int ret = 0;
6520
6521 if (mem_cgroup_disabled())
6522 goto out;
6523
6524 if (PageSwapCache(page)) {
6525
6526
6527
6528
6529
6530
6531
6532 VM_BUG_ON_PAGE(!PageLocked(page), page);
6533 if (compound_head(page)->mem_cgroup)
6534 goto out;
6535
6536 if (do_swap_account) {
6537 swp_entry_t ent = { .val = page_private(page), };
6538 unsigned short id = lookup_swap_cgroup_id(ent);
6539
6540 rcu_read_lock();
6541 memcg = mem_cgroup_from_id(id);
6542 if (memcg && !css_tryget_online(&memcg->css))
6543 memcg = NULL;
6544 rcu_read_unlock();
6545 }
6546 }
6547
6548 if (!memcg)
6549 memcg = get_mem_cgroup_from_mm(mm);
6550
6551 ret = try_charge(memcg, gfp_mask, nr_pages);
6552
6553 css_put(&memcg->css);
6554 out:
6555 *memcgp = memcg;
6556 return ret;
6557 }
6558
6559 int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6560 gfp_t gfp_mask, struct mem_cgroup **memcgp,
6561 bool compound)
6562 {
6563 struct mem_cgroup *memcg;
6564 int ret;
6565
6566 ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6567 memcg = *memcgp;
6568 mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6569 return ret;
6570 }
6571
/**
 * mem_cgroup_commit_charge - commit a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @lrucare: page might be on LRU already
 * @compound: charge the page as compound or small page
 *
 * Finalize a charge transaction started by mem_cgroup_try_charge(),
 * after page->mapping has been set up.  This must happen atomically
 * as part of the page instantiation, i.e. under the page table lock
 * for anonymous pages, under the page lock for page and swap cache.
 *
 * In addition, the page must not be on the LRU during the commit, to
 * prevent racing with task migration.  If it might be, use @lrucare.
 *
 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
 */
6589 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6590 bool lrucare, bool compound)
6591 {
6592 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6593
6594 VM_BUG_ON_PAGE(!page->mapping, page);
6595 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6596
6597 if (mem_cgroup_disabled())
6598 return;
6599
6600
6601
6602
6603
6604 if (!memcg)
6605 return;
6606
6607 commit_charge(page, memcg, lrucare);
6608
6609 local_irq_disable();
6610 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6611 memcg_check_events(memcg, page);
6612 local_irq_enable();
6613
6614 if (do_memsw_account() && PageSwapCache(page)) {
6615 swp_entry_t entry = { .val = page_private(page) };
6616
6617
6618
6619
6620
6621 mem_cgroup_uncharge_swap(entry, nr_pages);
6622 }
6623 }
6624
/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @compound: charge the page as compound or small page
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */
6633 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6634 bool compound)
6635 {
6636 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6637
6638 if (mem_cgroup_disabled())
6639 return;
6640
6641
6642
6643
6644
6645 if (!memcg)
6646 return;
6647
6648 cancel_charge(memcg, nr_pages);
6649 }
6650
6651 struct uncharge_gather {
6652 struct mem_cgroup *memcg;
6653 unsigned long pgpgout;
6654 unsigned long nr_anon;
6655 unsigned long nr_file;
6656 unsigned long nr_kmem;
6657 unsigned long nr_huge;
6658 unsigned long nr_shmem;
6659 struct page *dummy_page;
6660 };
6661
6662 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6663 {
6664 memset(ug, 0, sizeof(*ug));
6665 }
6666
6667 static void uncharge_batch(const struct uncharge_gather *ug)
6668 {
6669 unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
6670 unsigned long flags;
6671
6672 if (!mem_cgroup_is_root(ug->memcg)) {
6673 page_counter_uncharge(&ug->memcg->memory, nr_pages);
6674 if (do_memsw_account())
6675 page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6676 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6677 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6678 memcg_oom_recover(ug->memcg);
6679 }
6680
6681 local_irq_save(flags);
6682 __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6683 __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6684 __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6685 __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6686 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6687 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
6688 memcg_check_events(ug->memcg, ug->dummy_page);
6689 local_irq_restore(flags);
6690
6691 if (!mem_cgroup_is_root(ug->memcg))
6692 css_put_many(&ug->memcg->css, nr_pages);
6693 }
6694
6695 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6696 {
6697 VM_BUG_ON_PAGE(PageLRU(page), page);
6698 VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6699 !PageHWPoison(page), page);
6700
6701 if (!page->mem_cgroup)
6702 return;
6703
6704
6705
6706
6707
6708
6709
6710 if (ug->memcg != page->mem_cgroup) {
6711 if (ug->memcg) {
6712 uncharge_batch(ug);
6713 uncharge_gather_clear(ug);
6714 }
6715 ug->memcg = page->mem_cgroup;
6716 }
6717
6718 if (!PageKmemcg(page)) {
6719 unsigned int nr_pages = 1;
6720
6721 if (PageTransHuge(page)) {
6722 nr_pages = compound_nr(page);
6723 ug->nr_huge += nr_pages;
6724 }
6725 if (PageAnon(page))
6726 ug->nr_anon += nr_pages;
6727 else {
6728 ug->nr_file += nr_pages;
6729 if (PageSwapBacked(page))
6730 ug->nr_shmem += nr_pages;
6731 }
6732 ug->pgpgout++;
6733 } else {
6734 ug->nr_kmem += compound_nr(page);
6735 __ClearPageKmemcg(page);
6736 }
6737
6738 ug->dummy_page = page;
6739 page->mem_cgroup = NULL;
6740 }
6741
6742 static void uncharge_list(struct list_head *page_list)
6743 {
6744 struct uncharge_gather ug;
6745 struct list_head *next;
6746
6747 uncharge_gather_clear(&ug);
6748
	/*
	 * Note that the list can be a single page->lru; hence the
	 * do-while loop instead of a simple list_for_each_entry().
	 */
6753 next = page_list->next;
6754 do {
6755 struct page *page;
6756
6757 page = list_entry(next, struct page, lru);
6758 next = page->lru.next;
6759
6760 uncharge_page(page, &ug);
6761 } while (next != page_list);
6762
6763 if (ug.memcg)
6764 uncharge_batch(&ug);
6765 }
6766
/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_try_charge() and
 * mem_cgroup_commit_charge().
 */
6774 void mem_cgroup_uncharge(struct page *page)
6775 {
6776 struct uncharge_gather ug;
6777
6778 if (mem_cgroup_disabled())
6779 return;
6780
6781
6782 if (!page->mem_cgroup)
6783 return;
6784
6785 uncharge_gather_clear(&ug);
6786 uncharge_page(page, &ug);
6787 uncharge_batch(&ug);
6788 }
6789
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
 */
6797 void mem_cgroup_uncharge_list(struct list_head *page_list)
6798 {
6799 if (mem_cgroup_disabled())
6800 return;
6801
6802 if (!list_empty(page_list))
6803 uncharge_list(page_list);
6804 }
6805
/**
 * mem_cgroup_migrate - charge a page's replacement
 * @oldpage: currently circulating page
 * @newpage: replacement page
 *
 * Charge @newpage as a replacement page for @oldpage. @oldpage will
 * be uncharged upon free.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
6816 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6817 {
6818 struct mem_cgroup *memcg;
6819 unsigned int nr_pages;
6820 bool compound;
6821 unsigned long flags;
6822
6823 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6824 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6825 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6826 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6827 newpage);
6828
6829 if (mem_cgroup_disabled())
6830 return;
6831
6832
6833 if (newpage->mem_cgroup)
6834 return;
6835
6836
6837 memcg = oldpage->mem_cgroup;
6838 if (!memcg)
6839 return;
6840
6841
6842 compound = PageTransHuge(newpage);
6843 nr_pages = compound ? hpage_nr_pages(newpage) : 1;
6844
6845 page_counter_charge(&memcg->memory, nr_pages);
6846 if (do_memsw_account())
6847 page_counter_charge(&memcg->memsw, nr_pages);
6848 css_get_many(&memcg->css, nr_pages);
6849
6850 commit_charge(newpage, memcg, false);
6851
6852 local_irq_save(flags);
6853 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
6854 memcg_check_events(memcg, newpage);
6855 local_irq_restore(flags);
6856 }
6857
6858 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6859 EXPORT_SYMBOL(memcg_sockets_enabled_key);
6860
6861 void mem_cgroup_sk_alloc(struct sock *sk)
6862 {
6863 struct mem_cgroup *memcg;
6864
6865 if (!mem_cgroup_sockets_enabled)
6866 return;
6867
6868
6869 if (in_interrupt())
6870 return;
6871
6872 rcu_read_lock();
6873 memcg = mem_cgroup_from_task(current);
6874 if (memcg == root_mem_cgroup)
6875 goto out;
6876 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
6877 goto out;
6878 if (css_tryget_online(&memcg->css))
6879 sk->sk_memcg = memcg;
6880 out:
6881 rcu_read_unlock();
6882 }
6883
6884 void mem_cgroup_sk_free(struct sock *sk)
6885 {
6886 if (sk->sk_memcg)
6887 css_put(&sk->sk_memcg->css);
6888 }
6889
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 *
 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
 * @memcg's configured limit, %false if the charge had to be forced over
 * it (the memory is charged regardless).
 */
6898 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6899 {
6900 gfp_t gfp_mask = GFP_KERNEL;
6901
6902 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6903 struct page_counter *fail;
6904
6905 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
6906 memcg->tcpmem_pressure = 0;
6907 return true;
6908 }
6909 page_counter_charge(&memcg->tcpmem, nr_pages);
6910 memcg->tcpmem_pressure = 1;
6911 return false;
6912 }
6913
6914
6915 if (in_softirq())
6916 gfp_mask = GFP_NOWAIT;
6917
6918 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
6919
6920 if (try_charge(memcg, gfp_mask, nr_pages) == 0)
6921 return true;
6922
6923 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
6924 return false;
6925 }
6926
/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
6932 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6933 {
6934 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6935 page_counter_uncharge(&memcg->tcpmem, nr_pages);
6936 return;
6937 }
6938
6939 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
6940
6941 refill_stock(memcg, nr_pages);
6942 }
6943
6944 static int __init cgroup_memory(char *s)
6945 {
6946 char *token;
6947
6948 while ((token = strsep(&s, ",")) != NULL) {
6949 if (!*token)
6950 continue;
6951 if (!strcmp(token, "nosocket"))
6952 cgroup_memory_nosocket = true;
6953 if (!strcmp(token, "nokmem"))
6954 cgroup_memory_nokmem = true;
6955 }
6956 return 0;
6957 }
6958 __setup("cgroup.memory=", cgroup_memory);
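
cgroup_memory() above tokenizes the comma-separated cgroup.memory= boot option with strsep(), so booting with e.g. cgroup.memory=nosocket,nokmem sets both flags. A stand-alone user-space sketch of the same tokenization is shown below; the option string is just an example, and the program is not part of the kernel.

/* Illustrative only: user-space replica of the cgroup.memory= parsing. */
#define _DEFAULT_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	char opts[] = "nosocket,nokmem";	/* example option string */
	char *s = opts, *token;
	bool nosocket = false, nokmem = false;

	while ((token = strsep(&s, ",")) != NULL) {
		if (!*token)
			continue;		/* skip empty tokens, as the kernel does */
		if (!strcmp(token, "nosocket"))
			nosocket = true;
		if (!strcmp(token, "nokmem"))
			nokmem = true;
	}
	printf("nosocket=%d nokmem=%d\n", nosocket, nokmem);
	return 0;
}
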
6959
/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from
 * this context because of lock dependencies, but basically everything
 * that doesn't depend on a specific mem_cgroup structure should be
 * initialized from here.
 */
6968 static int __init mem_cgroup_init(void)
6969 {
6970 int cpu, node;
6971
6972 #ifdef CONFIG_MEMCG_KMEM
	/*
	 * Kmem cache creation is mostly done with the slab_mutex held,
	 * so use a workqueue with limited concurrency to avoid stalling
	 * all worker threads in case lots of cgroups are created and
	 * destroyed simultaneously.
	 */
6979 memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6980 BUG_ON(!memcg_kmem_cache_wq);
6981 #endif
6982
6983 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
6984 memcg_hotplug_cpu_dead);
6985
6986 for_each_possible_cpu(cpu)
6987 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6988 drain_local_stock);
6989
6990 for_each_node(node) {
6991 struct mem_cgroup_tree_per_node *rtpn;
6992
6993 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
6994 node_online(node) ? node : NUMA_NO_NODE);
6995
6996 rtpn->rb_root = RB_ROOT;
6997 rtpn->rb_rightmost = NULL;
6998 spin_lock_init(&rtpn->lock);
6999 soft_limit_tree.rb_tree_per_node[node] = rtpn;
7000 }
7001
7002 return 0;
7003 }
7004 subsys_initcall(mem_cgroup_init);
7005
7006 #ifdef CONFIG_MEMCG_SWAP
7007 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7008 {
7009 while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
7014 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
7015 VM_BUG_ON(1);
7016 break;
7017 }
7018 memcg = parent_mem_cgroup(memcg);
7019 if (!memcg)
7020 memcg = root_mem_cgroup;
7021 }
7022 return memcg;
7023 }
7024
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
7032 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
7033 {
7034 struct mem_cgroup *memcg, *swap_memcg;
7035 unsigned int nr_entries;
7036 unsigned short oldid;
7037
7038 VM_BUG_ON_PAGE(PageLRU(page), page);
7039 VM_BUG_ON_PAGE(page_count(page), page);
7040
7041 if (!do_memsw_account())
7042 return;
7043
7044 memcg = page->mem_cgroup;
7045
7046
7047 if (!memcg)
7048 return;
7049
	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead.
	 */
7055 swap_memcg = mem_cgroup_id_get_online(memcg);
7056 nr_entries = hpage_nr_pages(page);
7057
7058 if (nr_entries > 1)
7059 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7060 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7061 nr_entries);
7062 VM_BUG_ON_PAGE(oldid, page);
7063 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7064
7065 page->mem_cgroup = NULL;
7066
7067 if (!mem_cgroup_is_root(memcg))
7068 page_counter_uncharge(&memcg->memory, nr_entries);
7069
7070 if (memcg != swap_memcg) {
7071 if (!mem_cgroup_is_root(swap_memcg))
7072 page_counter_charge(&swap_memcg->memsw, nr_entries);
7073 page_counter_uncharge(&memcg->memsw, nr_entries);
7074 }
7075
7076
7077
7078
7079
7080
7081
7082 VM_BUG_ON(!irqs_disabled());
7083 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
7084 -nr_entries);
7085 memcg_check_events(memcg, page);
7086
7087 if (!mem_cgroup_is_root(memcg))
7088 css_put_many(&memcg->css, nr_entries);
7089 }
7090
/**
 * mem_cgroup_try_charge_swap - try charging swap space for a page
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge the swap space of @page to the memcg that owns the page.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
7100 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7101 {
7102 unsigned int nr_pages = hpage_nr_pages(page);
7103 struct page_counter *counter;
7104 struct mem_cgroup *memcg;
7105 unsigned short oldid;
7106
7107 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
7108 return 0;
7109
7110 memcg = page->mem_cgroup;
7111
7112
7113 if (!memcg)
7114 return 0;
7115
7116 if (!entry.val) {
7117 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7118 return 0;
7119 }
7120
7121 memcg = mem_cgroup_id_get_online(memcg);
7122
7123 if (!mem_cgroup_is_root(memcg) &&
7124 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7125 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7126 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7127 mem_cgroup_id_put(memcg);
7128 return -ENOMEM;
7129 }
7130
7131
7132 if (nr_pages > 1)
7133 mem_cgroup_id_get_many(memcg, nr_pages - 1);
7134 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7135 VM_BUG_ON_PAGE(oldid, page);
7136 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7137
7138 return 0;
7139 }
7140
/**
 * mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
7146 void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7147 {
7148 struct mem_cgroup *memcg;
7149 unsigned short id;
7150
7151 if (!do_swap_account)
7152 return;
7153
7154 id = swap_cgroup_record(entry, 0, nr_pages);
7155 rcu_read_lock();
7156 memcg = mem_cgroup_from_id(id);
7157 if (memcg) {
7158 if (!mem_cgroup_is_root(memcg)) {
7159 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7160 page_counter_uncharge(&memcg->swap, nr_pages);
7161 else
7162 page_counter_uncharge(&memcg->memsw, nr_pages);
7163 }
7164 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7165 mem_cgroup_id_put_many(memcg, nr_pages);
7166 }
7167 rcu_read_unlock();
7168 }
7169
7170 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7171 {
7172 long nr_swap_pages = get_nr_swap_pages();
7173
7174 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7175 return nr_swap_pages;
7176 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7177 nr_swap_pages = min_t(long, nr_swap_pages,
7178 READ_ONCE(memcg->swap.max) -
7179 page_counter_read(&memcg->swap));
7180 return nr_swap_pages;
7181 }
7182
7183 bool mem_cgroup_swap_full(struct page *page)
7184 {
7185 struct mem_cgroup *memcg;
7186
7187 VM_BUG_ON_PAGE(!PageLocked(page), page);
7188
7189 if (vm_swap_full())
7190 return true;
7191 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7192 return false;
7193
7194 memcg = page->mem_cgroup;
7195 if (!memcg)
7196 return false;
7197
7198 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7199 if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
7200 return true;
7201
7202 return false;
7203 }
7204
7205
7206 #ifdef CONFIG_MEMCG_SWAP_ENABLED
7207 static int really_do_swap_account __initdata = 1;
7208 #else
7209 static int really_do_swap_account __initdata;
7210 #endif
7211
7212 static int __init enable_swap_account(char *s)
7213 {
7214 if (!strcmp(s, "1"))
7215 really_do_swap_account = 1;
7216 else if (!strcmp(s, "0"))
7217 really_do_swap_account = 0;
7218 return 1;
7219 }
7220 __setup("swapaccount=", enable_swap_account);
7221
7222 static u64 swap_current_read(struct cgroup_subsys_state *css,
7223 struct cftype *cft)
7224 {
7225 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7226
7227 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7228 }
7229
7230 static int swap_max_show(struct seq_file *m, void *v)
7231 {
7232 return seq_puts_memcg_tunable(m,
7233 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7234 }
7235
7236 static ssize_t swap_max_write(struct kernfs_open_file *of,
7237 char *buf, size_t nbytes, loff_t off)
7238 {
7239 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7240 unsigned long max;
7241 int err;
7242
7243 buf = strstrip(buf);
7244 err = page_counter_memparse(buf, "max", &max);
7245 if (err)
7246 return err;
7247
7248 xchg(&memcg->swap.max, max);
7249
7250 return nbytes;
7251 }
7252
7253 static int swap_events_show(struct seq_file *m, void *v)
7254 {
7255 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7256
7257 seq_printf(m, "max %lu\n",
7258 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7259 seq_printf(m, "fail %lu\n",
7260 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7261
7262 return 0;
7263 }
7264
7265 static struct cftype swap_files[] = {
7266 {
7267 .name = "swap.current",
7268 .flags = CFTYPE_NOT_ON_ROOT,
7269 .read_u64 = swap_current_read,
7270 },
7271 {
7272 .name = "swap.max",
7273 .flags = CFTYPE_NOT_ON_ROOT,
7274 .seq_show = swap_max_show,
7275 .write = swap_max_write,
7276 },
7277 {
7278 .name = "swap.events",
7279 .flags = CFTYPE_NOT_ON_ROOT,
7280 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7281 .seq_show = swap_events_show,
7282 },
7283 { }
7284 };
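
swap_files above adds the cgroup v2 swap knobs (memory.swap.current, memory.swap.max, memory.swap.events), registered in mem_cgroup_swap_init() below; swap_current_read() and swap_max_show() back the first two. A small sketch that prints both counters follows, again assuming a hypothetical group at /sys/fs/cgroup/demo.

/* Illustrative only: read the v2 swap counters of one group. */
#include <stdio.h>

int main(void)
{
	const char *files[] = {
		"/sys/fs/cgroup/demo/memory.swap.current",	/* assumed path */
		"/sys/fs/cgroup/demo/memory.swap.max",
	};
	char line[64];
	FILE *f;
	int i;

	for (i = 0; i < 2; i++) {
		f = fopen(files[i], "r");
		if (!f) {
			perror(files[i]);
			continue;
		}
		if (fgets(line, sizeof(line), f))
			printf("%s: %s", files[i], line);	/* bytes, or "max" */
		fclose(f);
	}
	return 0;
}
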
7285
7286 static struct cftype memsw_cgroup_files[] = {
7287 {
7288 .name = "memsw.usage_in_bytes",
7289 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7290 .read_u64 = mem_cgroup_read_u64,
7291 },
7292 {
7293 .name = "memsw.max_usage_in_bytes",
7294 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7295 .write = mem_cgroup_reset,
7296 .read_u64 = mem_cgroup_read_u64,
7297 },
7298 {
7299 .name = "memsw.limit_in_bytes",
7300 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7301 .write = mem_cgroup_write,
7302 .read_u64 = mem_cgroup_read_u64,
7303 },
7304 {
7305 .name = "memsw.failcnt",
7306 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7307 .write = mem_cgroup_reset,
7308 .read_u64 = mem_cgroup_read_u64,
7309 },
7310 { },
7311 };
7312
7313 static int __init mem_cgroup_swap_init(void)
7314 {
7315 if (!mem_cgroup_disabled() && really_do_swap_account) {
7316 do_swap_account = 1;
7317 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
7318 swap_files));
7319 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
7320 memsw_cgroup_files));
7321 }
7322 return 0;
7323 }
7324 subsys_initcall(mem_cgroup_swap_init);
7325
7326 #endif