This source file includes the following definitions:
- set_task_reclaim_state
- prealloc_memcg_shrinker
- unregister_memcg_shrinker
- global_reclaim
- sane_reclaim
- set_memcg_congestion
- memcg_congested
- prealloc_memcg_shrinker
- unregister_memcg_shrinker
- global_reclaim
- sane_reclaim
- set_memcg_congestion
- memcg_congested
- zone_reclaimable_pages
- lruvec_lru_size
- prealloc_shrinker
- free_prealloced_shrinker
- register_shrinker_prepared
- register_shrinker
- unregister_shrinker
- do_shrink_slab
- shrink_slab_memcg
- shrink_slab_memcg
- shrink_slab
- drop_slab_node
- drop_slab
- is_page_cache_freeable
- may_write_to_inode
- handle_write_error
- __remove_mapping
- remove_mapping
- putback_lru_page
- page_check_references
- page_check_dirty_writeback
- shrink_page_list
- reclaim_clean_pages_from_list
- __isolate_lru_page
- update_lru_sizes
- isolate_lru_pages
- isolate_lru_page
- too_many_isolated
- move_pages_to_lru
- current_may_throttle
- shrink_inactive_list
- shrink_active_list
- reclaim_pages
- inactive_list_is_low
- shrink_list
- get_scan_count
- shrink_node_memcg
- in_reclaim_compaction
- should_continue_reclaim
- pgdat_memcg_congested
- shrink_node
- compaction_ready
- shrink_zones
- snapshot_refaults
- do_try_to_free_pages
- allow_direct_reclaim
- throttle_direct_reclaim
- try_to_free_pages
- mem_cgroup_shrink_node
- try_to_free_mem_cgroup_pages
- age_active_anon
- pgdat_watermark_boosted
- pgdat_balanced
- clear_pgdat_congested
- prepare_kswapd_sleep
- kswapd_shrink_node
- balance_pgdat
- kswapd_classzone_idx
- kswapd_try_to_sleep
- kswapd
- wakeup_kswapd
- shrink_all_memory
- kswapd_cpu_online
- kswapd_run
- kswapd_stop
- kswapd_init
- node_unmapped_file_pages
- node_pagecache_reclaimable
- __node_reclaim
- node_reclaim
- page_evictable
- check_move_unevictable_pages
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17 #include <linux/mm.h>
18 #include <linux/sched/mm.h>
19 #include <linux/module.h>
20 #include <linux/gfp.h>
21 #include <linux/kernel_stat.h>
22 #include <linux/swap.h>
23 #include <linux/pagemap.h>
24 #include <linux/init.h>
25 #include <linux/highmem.h>
26 #include <linux/vmpressure.h>
27 #include <linux/vmstat.h>
28 #include <linux/file.h>
29 #include <linux/writeback.h>
30 #include <linux/blkdev.h>
31 #include <linux/buffer_head.h>
32
33 #include <linux/mm_inline.h>
34 #include <linux/backing-dev.h>
35 #include <linux/rmap.h>
36 #include <linux/topology.h>
37 #include <linux/cpu.h>
38 #include <linux/cpuset.h>
39 #include <linux/compaction.h>
40 #include <linux/notifier.h>
41 #include <linux/rwsem.h>
42 #include <linux/delay.h>
43 #include <linux/kthread.h>
44 #include <linux/freezer.h>
45 #include <linux/memcontrol.h>
46 #include <linux/delayacct.h>
47 #include <linux/sysctl.h>
48 #include <linux/oom.h>
49 #include <linux/pagevec.h>
50 #include <linux/prefetch.h>
51 #include <linux/printk.h>
52 #include <linux/dax.h>
53 #include <linux/psi.h>
54
55 #include <asm/tlbflush.h>
56 #include <asm/div64.h>
57
58 #include <linux/swapops.h>
59 #include <linux/balloon_compaction.h>
60
61 #include "internal.h"
62
63 #define CREATE_TRACE_POINTS
64 #include <trace/events/vmscan.h>
65
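/*
 * Per-invocation reclaim control: tunables set by the caller plus
 * counters filled in as reclaim proceeds.  One scan_control is shared
 * by the whole call chain of a single reclaim pass (direct reclaim,
 * kswapd, or memcg reclaim).
 */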
66 struct scan_control {
67
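/* How many pages shrink_list() should reclaim */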
68 unsigned long nr_to_reclaim;
69
70
71
72
73
74 nodemask_t *nodemask;
75
76
77
78
79
80 struct mem_cgroup *target_mem_cgroup;
81
82
83 unsigned int may_writepage:1;
84
85
86 unsigned int may_unmap:1;
87
88
89 unsigned int may_swap:1;
90
91
92
93
94
95
96 unsigned int memcg_low_reclaim:1;
97 unsigned int memcg_low_skipped:1;
98
99 unsigned int hibernation_mode:1;
100
101
102 unsigned int compaction_ready:1;
103
104
105 s8 order;
106
107
108 s8 priority;
109
110
111 s8 reclaim_idx;
112
113
114 gfp_t gfp_mask;
115
116
117 unsigned long nr_scanned;
118
119
120 unsigned long nr_reclaimed;
121
122 struct {
123 unsigned int dirty;
124 unsigned int unqueued_dirty;
125 unsigned int congested;
126 unsigned int writeback;
127 unsigned int immediate;
128 unsigned int file_taken;
129 unsigned int taken;
130 } nr;
131
132
133 struct reclaim_state reclaim_state;
134 };
135
136 #ifdef ARCH_HAS_PREFETCH
137 #define prefetch_prev_lru_page(_page, _base, _field) \
138 do { \
139 if ((_page)->lru.prev != _base) { \
140 struct page *prev; \
141 \
142 prev = lru_to_page(&(_page->lru)); \
143 prefetch(&prev->_field); \
144 } \
145 } while (0)
146 #else
147 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
148 #endif
149
150 #ifdef ARCH_HAS_PREFETCHW
151 #define prefetchw_prev_lru_page(_page, _base, _field) \
152 do { \
153 if ((_page)->lru.prev != _base) { \
154 struct page *prev; \
155 \
156 prev = lru_to_page(&(_page->lru)); \
157 prefetchw(&prev->_field); \
158 } \
159 } while (0)
160 #else
161 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
162 #endif
163
164
165
166
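/* From 0 .. 100.  Higher means more swappy. */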
167 int vm_swappiness = 60;
168
169
170
171
172 unsigned long vm_total_pages;
173
174 static void set_task_reclaim_state(struct task_struct *task,
175 struct reclaim_state *rs)
176 {
177
178 WARN_ON_ONCE(rs && task->reclaim_state);
179
180
181 WARN_ON_ONCE(!rs && !task->reclaim_state);
182
183 task->reclaim_state = rs;
184 }
185
186 static LIST_HEAD(shrinker_list);
187 static DECLARE_RWSEM(shrinker_rwsem);
188
189 #ifdef CONFIG_MEMCG
190
191
192
193
194
195
196
197
198
199
200
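/*
 * Memcg-aware shrinkers are indexed in shrinker_idr so the per-memcg
 * shrinker bitmaps can refer to them by id.  SHRINKER_REGISTERING marks
 * an id that has been allocated but whose shrinker is not yet live.
 */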
201 #define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
202
203 static DEFINE_IDR(shrinker_idr);
204 static int shrinker_nr_max;
205
206 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
207 {
208 int id, ret = -ENOMEM;
209
210 down_write(&shrinker_rwsem);
211
212 id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
213 if (id < 0)
214 goto unlock;
215
216 if (id >= shrinker_nr_max) {
217 if (memcg_expand_shrinker_maps(id)) {
218 idr_remove(&shrinker_idr, id);
219 goto unlock;
220 }
221
222 shrinker_nr_max = id + 1;
223 }
224 shrinker->id = id;
225 ret = 0;
226 unlock:
227 up_write(&shrinker_rwsem);
228 return ret;
229 }
230
231 static void unregister_memcg_shrinker(struct shrinker *shrinker)
232 {
233 int id = shrinker->id;
234
235 BUG_ON(id < 0);
236
237 down_write(&shrinker_rwsem);
238 idr_remove(&shrinker_idr, id);
239 up_write(&shrinker_rwsem);
240 }
241
242 static bool global_reclaim(struct scan_control *sc)
243 {
244 return !sc->target_mem_cgroup;
245 }
246
247
248
249
250
251
252
253
254
255
256
257
258
259
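/*
 * Tests whether the normal dirty-throttling mechanism can be relied on
 * for this reclaim: true for global reclaim and for cgroup v2, where
 * writeback is memcg-aware.  Legacy (cgroup v1) memcg reclaim has to
 * throttle dirty pages itself in shrink_page_list().
 */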
260 static bool sane_reclaim(struct scan_control *sc)
261 {
262 struct mem_cgroup *memcg = sc->target_mem_cgroup;
263
264 if (!memcg)
265 return true;
266 #ifdef CONFIG_CGROUP_WRITEBACK
267 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
268 return true;
269 #endif
270 return false;
271 }
272
273 static void set_memcg_congestion(pg_data_t *pgdat,
274 struct mem_cgroup *memcg,
275 bool congested)
276 {
277 struct mem_cgroup_per_node *mn;
278
279 if (!memcg)
280 return;
281
282 mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
283 WRITE_ONCE(mn->congested, congested);
284 }
285
286 static bool memcg_congested(pg_data_t *pgdat,
287 struct mem_cgroup *memcg)
288 {
289 struct mem_cgroup_per_node *mn;
290
291 mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
292 return READ_ONCE(mn->congested);
293
294 }
295 #else
296 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
297 {
298 return 0;
299 }
300
301 static void unregister_memcg_shrinker(struct shrinker *shrinker)
302 {
303 }
304
305 static bool global_reclaim(struct scan_control *sc)
306 {
307 return true;
308 }
309
310 static bool sane_reclaim(struct scan_control *sc)
311 {
312 return true;
313 }
314
315 static inline void set_memcg_congestion(struct pglist_data *pgdat,
316 struct mem_cgroup *memcg, bool congested)
317 {
318 }
319
320 static inline bool memcg_congested(struct pglist_data *pgdat,
321 struct mem_cgroup *memcg)
322 {
323 return false;
324
325 }
326 #endif
327
328
329
330
331
332
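/*
 * zone_reclaimable_pages - the number of potentially reclaimable pages
 * in a zone: file LRU pages plus, when swap space is available, anon
 * LRU pages.
 */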
333 unsigned long zone_reclaimable_pages(struct zone *zone)
334 {
335 unsigned long nr;
336
337 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
338 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
339 if (get_nr_swap_pages() > 0)
340 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
341 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
342
343 return nr;
344 }
345
346
347
348
349
350
351
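/**
 * lruvec_lru_size - Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
 */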
352 unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
353 {
354 unsigned long lru_size = 0;
355 int zid;
356
357 if (!mem_cgroup_disabled()) {
358 for (zid = 0; zid < MAX_NR_ZONES; zid++)
359 lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
360 } else
361 lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
362
363 for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
364 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
365 unsigned long size;
366
367 if (!managed_zone(zone))
368 continue;
369
370 if (!mem_cgroup_disabled())
371 size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
372 else
373 size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
374 NR_ZONE_LRU_BASE + lru);
375 lru_size -= min(size, lru_size);
376 }
377
378 return lru_size;
379
380 }
381
382
383
384
385 int prealloc_shrinker(struct shrinker *shrinker)
386 {
387 unsigned int size = sizeof(*shrinker->nr_deferred);
388
389 if (shrinker->flags & SHRINKER_NUMA_AWARE)
390 size *= nr_node_ids;
391
392 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
393 if (!shrinker->nr_deferred)
394 return -ENOMEM;
395
396 if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
397 if (prealloc_memcg_shrinker(shrinker))
398 goto free_deferred;
399 }
400
401 return 0;
402
403 free_deferred:
404 kfree(shrinker->nr_deferred);
405 shrinker->nr_deferred = NULL;
406 return -ENOMEM;
407 }
408
409 void free_prealloced_shrinker(struct shrinker *shrinker)
410 {
411 if (!shrinker->nr_deferred)
412 return;
413
414 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
415 unregister_memcg_shrinker(shrinker);
416
417 kfree(shrinker->nr_deferred);
418 shrinker->nr_deferred = NULL;
419 }
420
421 void register_shrinker_prepared(struct shrinker *shrinker)
422 {
423 down_write(&shrinker_rwsem);
424 list_add_tail(&shrinker->list, &shrinker_list);
425 #ifdef CONFIG_MEMCG
426 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
427 idr_replace(&shrinker_idr, shrinker, shrinker->id);
428 #endif
429 up_write(&shrinker_rwsem);
430 }
431
432 int register_shrinker(struct shrinker *shrinker)
433 {
434 int err = prealloc_shrinker(shrinker);
435
436 if (err)
437 return err;
438 register_shrinker_prepared(shrinker);
439 return 0;
440 }
441 EXPORT_SYMBOL(register_shrinker);
442
443
444
445
446 void unregister_shrinker(struct shrinker *shrinker)
447 {
448 if (!shrinker->nr_deferred)
449 return;
450 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
451 unregister_memcg_shrinker(shrinker);
452 down_write(&shrinker_rwsem);
453 list_del(&shrinker->list);
454 up_write(&shrinker_rwsem);
455 kfree(shrinker->nr_deferred);
456 shrinker->nr_deferred = NULL;
457 }
458 EXPORT_SYMBOL(unregister_shrinker);
459
460 #define SHRINK_BATCH 128
461
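/*
 * Run one shrinker for one node: scale the scan target by the reclaim
 * priority and shrinker->seeks, add work deferred from previous passes,
 * then call ->scan_objects() in batches of at most batch_size.  Any
 * remainder is stored back in nr_deferred.  Returns the number of
 * objects freed.
 */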
462 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
463 struct shrinker *shrinker, int priority)
464 {
465 unsigned long freed = 0;
466 unsigned long long delta;
467 long total_scan;
468 long freeable;
469 long nr;
470 long new_nr;
471 int nid = shrinkctl->nid;
472 long batch_size = shrinker->batch ? shrinker->batch
473 : SHRINK_BATCH;
474 long scanned = 0, next_deferred;
475
476 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
477 nid = 0;
478
479 freeable = shrinker->count_objects(shrinker, shrinkctl);
480 if (freeable == 0 || freeable == SHRINK_EMPTY)
481 return freeable;
482
483
484
485
486
487
488 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
489
490 total_scan = nr;
491 if (shrinker->seeks) {
492 delta = freeable >> priority;
493 delta *= 4;
494 do_div(delta, shrinker->seeks);
495 } else {
496
497
498
499
500
501 delta = freeable / 2;
502 }
503
504 total_scan += delta;
505 if (total_scan < 0) {
506 pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
507 shrinker->scan_objects, total_scan);
508 total_scan = freeable;
509 next_deferred = nr;
510 } else
511 next_deferred = total_scan;
512
513
514
515
516
517
518
519
520
521
522
523
524
525 if (delta < freeable / 4)
526 total_scan = min(total_scan, freeable / 2);
527
528
529
530
531
532
533 if (total_scan > freeable * 2)
534 total_scan = freeable * 2;
535
536 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
537 freeable, delta, total_scan, priority);
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554 while (total_scan >= batch_size ||
555 total_scan >= freeable) {
556 unsigned long ret;
557 unsigned long nr_to_scan = min(batch_size, total_scan);
558
559 shrinkctl->nr_to_scan = nr_to_scan;
560 shrinkctl->nr_scanned = nr_to_scan;
561 ret = shrinker->scan_objects(shrinker, shrinkctl);
562 if (ret == SHRINK_STOP)
563 break;
564 freed += ret;
565
566 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
567 total_scan -= shrinkctl->nr_scanned;
568 scanned += shrinkctl->nr_scanned;
569
570 cond_resched();
571 }
572
573 if (next_deferred >= scanned)
574 next_deferred -= scanned;
575 else
576 next_deferred = 0;
577
578
579
580
581
582 if (next_deferred > 0)
583 new_nr = atomic_long_add_return(next_deferred,
584 &shrinker->nr_deferred[nid]);
585 else
586 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
587
588 trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
589 return freed;
590 }
591
592 #ifdef CONFIG_MEMCG
593 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
594 struct mem_cgroup *memcg, int priority)
595 {
596 struct memcg_shrinker_map *map;
597 unsigned long ret, freed = 0;
598 int i;
599
600 if (!mem_cgroup_online(memcg))
601 return 0;
602
603 if (!down_read_trylock(&shrinker_rwsem))
604 return 0;
605
606 map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
607 true);
608 if (unlikely(!map))
609 goto unlock;
610
611 for_each_set_bit(i, map->map, shrinker_nr_max) {
612 struct shrink_control sc = {
613 .gfp_mask = gfp_mask,
614 .nid = nid,
615 .memcg = memcg,
616 };
617 struct shrinker *shrinker;
618
619 shrinker = idr_find(&shrinker_idr, i);
620 if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
621 if (!shrinker)
622 clear_bit(i, map->map);
623 continue;
624 }
625
626
627 if (!memcg_kmem_enabled() &&
628 !(shrinker->flags & SHRINKER_NONSLAB))
629 continue;
630
631 ret = do_shrink_slab(&sc, shrinker, priority);
632 if (ret == SHRINK_EMPTY) {
633 clear_bit(i, map->map);
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649 smp_mb__after_atomic();
650 ret = do_shrink_slab(&sc, shrinker, priority);
651 if (ret == SHRINK_EMPTY)
652 ret = 0;
653 else
654 memcg_set_shrinker_bit(memcg, nid, i);
655 }
656 freed += ret;
657
658 if (rwsem_is_contended(&shrinker_rwsem)) {
659 freed = freed ? : 1;
660 break;
661 }
662 }
663 unlock:
664 up_read(&shrinker_rwsem);
665 return freed;
666 }
667 #else
668 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
669 struct mem_cgroup *memcg, int priority)
670 {
671 return 0;
672 }
673 #endif
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
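/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.  @nid is passed
 * along to NUMA-aware shrinkers; others are called with a node id of 0.
 * Non-root memcg reclaim is dispatched through shrink_slab_memcg(),
 * which only walks the shrinkers flagged for that memcg.
 *
 * Returns the number of reclaimed slab objects.
 */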
695 static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
696 struct mem_cgroup *memcg,
697 int priority)
698 {
699 unsigned long ret, freed = 0;
700 struct shrinker *shrinker;
701
702
703
704
705
706
707
708
709 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
710 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
711
712 if (!down_read_trylock(&shrinker_rwsem))
713 goto out;
714
715 list_for_each_entry(shrinker, &shrinker_list, list) {
716 struct shrink_control sc = {
717 .gfp_mask = gfp_mask,
718 .nid = nid,
719 .memcg = memcg,
720 };
721
722 ret = do_shrink_slab(&sc, shrinker, priority);
723 if (ret == SHRINK_EMPTY)
724 ret = 0;
725 freed += ret;
726
727
728
729
730
731 if (rwsem_is_contended(&shrinker_rwsem)) {
732 freed = freed ? : 1;
733 break;
734 }
735 }
736
737 up_read(&shrinker_rwsem);
738 out:
739 cond_resched();
740 return freed;
741 }
742
743 void drop_slab_node(int nid)
744 {
745 unsigned long freed;
746
747 do {
748 struct mem_cgroup *memcg = NULL;
749
750 freed = 0;
751 memcg = mem_cgroup_iter(NULL, NULL, NULL);
752 do {
753 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
754 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
755 } while (freed > 10);
756 }
757
758 void drop_slab(void)
759 {
760 int nid;
761
762 for_each_online_node(nid)
763 drop_slab_node(nid);
764 }
765
766 static inline int is_page_cache_freeable(struct page *page)
767 {
768
769
770
771
772
773 int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
774 HPAGE_PMD_NR : 1;
775 return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
776 }
777
778 static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
779 {
780 if (current->flags & PF_SWAPWRITE)
781 return 1;
782 if (!inode_write_congested(inode))
783 return 1;
784 if (inode_to_bdi(inode) == current->backing_dev_info)
785 return 1;
786 return 0;
787 }
788
789
790
791
792
793
794
795
796
797
798
799
800
801 static void handle_write_error(struct address_space *mapping,
802 struct page *page, int error)
803 {
804 lock_page(page);
805 if (page_mapping(page) == mapping)
806 mapping_set_error(mapping, error);
807 unlock_page(page);
808 }
809
810
811 typedef enum {
812
813 PAGE_KEEP,
814
815 PAGE_ACTIVATE,
816
817 PAGE_SUCCESS,
818
819 PAGE_CLEAN,
820 } pageout_t;
821
822
823
824
825
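/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */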
826 static pageout_t pageout(struct page *page, struct address_space *mapping,
827 struct scan_control *sc)
828 {
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845 if (!is_page_cache_freeable(page))
846 return PAGE_KEEP;
847 if (!mapping) {
848
849
850
851
852 if (page_has_private(page)) {
853 if (try_to_free_buffers(page)) {
854 ClearPageDirty(page);
855 pr_info("%s: orphaned page\n", __func__);
856 return PAGE_CLEAN;
857 }
858 }
859 return PAGE_KEEP;
860 }
861 if (mapping->a_ops->writepage == NULL)
862 return PAGE_ACTIVATE;
863 if (!may_write_to_inode(mapping->host, sc))
864 return PAGE_KEEP;
865
866 if (clear_page_dirty_for_io(page)) {
867 int res;
868 struct writeback_control wbc = {
869 .sync_mode = WB_SYNC_NONE,
870 .nr_to_write = SWAP_CLUSTER_MAX,
871 .range_start = 0,
872 .range_end = LLONG_MAX,
873 .for_reclaim = 1,
874 };
875
876 SetPageReclaim(page);
877 res = mapping->a_ops->writepage(page, &wbc);
878 if (res < 0)
879 handle_write_error(mapping, page, res);
880 if (res == AOP_WRITEPAGE_ACTIVATE) {
881 ClearPageReclaim(page);
882 return PAGE_ACTIVATE;
883 }
884
885 if (!PageWriteback(page)) {
886
887 ClearPageReclaim(page);
888 }
889 trace_mm_vmscan_writepage(page);
890 inc_node_page_state(page, NR_VMSCAN_WRITE);
891 return PAGE_SUCCESS;
892 }
893
894 return PAGE_CLEAN;
895 }
896
897
898
899
900
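/*
 * Attempt to detach a locked page from its ->mapping.  If the page is
 * dirty or someone else holds a reference, back off and return 0; on
 * success the page is gone from the page cache (or swap cache) and 1 is
 * returned with its refcount frozen at zero.
 */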
901 static int __remove_mapping(struct address_space *mapping, struct page *page,
902 bool reclaimed)
903 {
904 unsigned long flags;
905 int refcount;
906
907 BUG_ON(!PageLocked(page));
908 BUG_ON(mapping != page_mapping(page));
909
910 xa_lock_irqsave(&mapping->i_pages, flags);
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936 refcount = 1 + compound_nr(page);
937 if (!page_ref_freeze(page, refcount))
938 goto cannot_free;
939
940 if (unlikely(PageDirty(page))) {
941 page_ref_unfreeze(page, refcount);
942 goto cannot_free;
943 }
944
945 if (PageSwapCache(page)) {
946 swp_entry_t swap = { .val = page_private(page) };
947 mem_cgroup_swapout(page, swap);
948 __delete_from_swap_cache(page, swap);
949 xa_unlock_irqrestore(&mapping->i_pages, flags);
950 put_swap_page(page, swap);
951 } else {
952 void (*freepage)(struct page *);
953 void *shadow = NULL;
954
955 freepage = mapping->a_ops->freepage;
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972 if (reclaimed && page_is_file_cache(page) &&
973 !mapping_exiting(mapping) && !dax_mapping(mapping))
974 shadow = workingset_eviction(page);
975 __delete_from_page_cache(page, shadow);
976 xa_unlock_irqrestore(&mapping->i_pages, flags);
977
978 if (freepage != NULL)
979 freepage(page);
980 }
981
982 return 1;
983
984 cannot_free:
985 xa_unlock_irqrestore(&mapping->i_pages, flags);
986 return 0;
987 }
988
989
990
991
992
993
994
995 int remove_mapping(struct address_space *mapping, struct page *page)
996 {
997 if (__remove_mapping(mapping, page, false)) {
998
999
1000
1001
1002
1003 page_ref_unfreeze(page, 1);
1004 return 1;
1005 }
1006 return 0;
1007 }
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018 void putback_lru_page(struct page *page)
1019 {
1020 lru_cache_add(page);
1021 put_page(page);
1022 }
1023
1024 enum page_references {
1025 PAGEREF_RECLAIM,
1026 PAGEREF_RECLAIM_CLEAN,
1027 PAGEREF_KEEP,
1028 PAGEREF_ACTIVATE,
1029 };
1030
1031 static enum page_references page_check_references(struct page *page,
1032 struct scan_control *sc)
1033 {
1034 int referenced_ptes, referenced_page;
1035 unsigned long vm_flags;
1036
1037 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
1038 &vm_flags);
1039 referenced_page = TestClearPageReferenced(page);
1040
1041
1042
1043
1044
1045 if (vm_flags & VM_LOCKED)
1046 return PAGEREF_RECLAIM;
1047
1048 if (referenced_ptes) {
1049 if (PageSwapBacked(page))
1050 return PAGEREF_ACTIVATE;
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065 SetPageReferenced(page);
1066
1067 if (referenced_page || referenced_ptes > 1)
1068 return PAGEREF_ACTIVATE;
1069
1070
1071
1072
1073 if (vm_flags & VM_EXEC)
1074 return PAGEREF_ACTIVATE;
1075
1076 return PAGEREF_KEEP;
1077 }
1078
1079
1080 if (referenced_page && !PageSwapBacked(page))
1081 return PAGEREF_RECLAIM_CLEAN;
1082
1083 return PAGEREF_RECLAIM;
1084 }
1085
1086
1087 static void page_check_dirty_writeback(struct page *page,
1088 bool *dirty, bool *writeback)
1089 {
1090 struct address_space *mapping;
1091
1092
1093
1094
1095
1096 if (!page_is_file_cache(page) ||
1097 (PageAnon(page) && !PageSwapBacked(page))) {
1098 *dirty = false;
1099 *writeback = false;
1100 return;
1101 }
1102
1103
1104 *dirty = PageDirty(page);
1105 *writeback = PageWriteback(page);
1106
1107
1108 if (!page_has_private(page))
1109 return;
1110
1111 mapping = page_mapping(page);
1112 if (mapping && mapping->a_ops->is_dirty_writeback)
1113 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
1114 }
1115
1116
1117
1118
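/*
 * shrink_page_list() returns the number of reclaimed pages
 */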
1119 static unsigned long shrink_page_list(struct list_head *page_list,
1120 struct pglist_data *pgdat,
1121 struct scan_control *sc,
1122 enum ttu_flags ttu_flags,
1123 struct reclaim_stat *stat,
1124 bool ignore_references)
1125 {
1126 LIST_HEAD(ret_pages);
1127 LIST_HEAD(free_pages);
1128 unsigned nr_reclaimed = 0;
1129 unsigned pgactivate = 0;
1130
1131 memset(stat, 0, sizeof(*stat));
1132 cond_resched();
1133
1134 while (!list_empty(page_list)) {
1135 struct address_space *mapping;
1136 struct page *page;
1137 int may_enter_fs;
1138 enum page_references references = PAGEREF_RECLAIM;
1139 bool dirty, writeback;
1140 unsigned int nr_pages;
1141
1142 cond_resched();
1143
1144 page = lru_to_page(page_list);
1145 list_del(&page->lru);
1146
1147 if (!trylock_page(page))
1148 goto keep;
1149
1150 VM_BUG_ON_PAGE(PageActive(page), page);
1151
1152 nr_pages = compound_nr(page);
1153
1154
1155 sc->nr_scanned += nr_pages;
1156
1157 if (unlikely(!page_evictable(page)))
1158 goto activate_locked;
1159
1160 if (!sc->may_unmap && page_mapped(page))
1161 goto keep_locked;
1162
1163 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1164 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1165
1166
1167
1168
1169
1170
1171
1172 page_check_dirty_writeback(page, &dirty, &writeback);
1173 if (dirty || writeback)
1174 stat->nr_dirty++;
1175
1176 if (dirty && !writeback)
1177 stat->nr_unqueued_dirty++;
1178
1179
1180
1181
1182
1183
1184
1185 mapping = page_mapping(page);
1186 if (((dirty || writeback) && mapping &&
1187 inode_write_congested(mapping->host)) ||
1188 (writeback && PageReclaim(page)))
1189 stat->nr_congested++;
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233 if (PageWriteback(page)) {
1234
1235 if (current_is_kswapd() &&
1236 PageReclaim(page) &&
1237 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1238 stat->nr_immediate++;
1239 goto activate_locked;
1240
1241
1242 } else if (sane_reclaim(sc) ||
1243 !PageReclaim(page) || !may_enter_fs) {
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255 SetPageReclaim(page);
1256 stat->nr_writeback++;
1257 goto activate_locked;
1258
1259
1260 } else {
1261 unlock_page(page);
1262 wait_on_page_writeback(page);
1263
1264 list_add_tail(&page->lru, page_list);
1265 continue;
1266 }
1267 }
1268
1269 if (!ignore_references)
1270 references = page_check_references(page, sc);
1271
1272 switch (references) {
1273 case PAGEREF_ACTIVATE:
1274 goto activate_locked;
1275 case PAGEREF_KEEP:
1276 stat->nr_ref_keep += nr_pages;
1277 goto keep_locked;
1278 case PAGEREF_RECLAIM:
1279 case PAGEREF_RECLAIM_CLEAN:
1280 ;
1281 }
1282
1283
1284
1285
1286
1287
1288 if (PageAnon(page) && PageSwapBacked(page)) {
1289 if (!PageSwapCache(page)) {
1290 if (!(sc->gfp_mask & __GFP_IO))
1291 goto keep_locked;
1292 if (PageTransHuge(page)) {
1293
1294 if (!can_split_huge_page(page, NULL))
1295 goto activate_locked;
1296
1297
1298
1299
1300
1301 if (!compound_mapcount(page) &&
1302 split_huge_page_to_list(page,
1303 page_list))
1304 goto activate_locked;
1305 }
1306 if (!add_to_swap(page)) {
1307 if (!PageTransHuge(page))
1308 goto activate_locked_split;
1309
1310 if (split_huge_page_to_list(page,
1311 page_list))
1312 goto activate_locked;
1313 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1314 count_vm_event(THP_SWPOUT_FALLBACK);
1315 #endif
1316 if (!add_to_swap(page))
1317 goto activate_locked_split;
1318 }
1319
1320 may_enter_fs = 1;
1321
1322
1323 mapping = page_mapping(page);
1324 }
1325 } else if (unlikely(PageTransHuge(page))) {
1326
1327 if (split_huge_page_to_list(page, page_list))
1328 goto keep_locked;
1329 }
1330
1331
1332
1333
1334
1335
1336
1337
1338 if ((nr_pages > 1) && !PageTransHuge(page)) {
1339 sc->nr_scanned -= (nr_pages - 1);
1340 nr_pages = 1;
1341 }
1342
1343
1344
1345
1346
1347 if (page_mapped(page)) {
1348 enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
1349
1350 if (unlikely(PageTransHuge(page)))
1351 flags |= TTU_SPLIT_HUGE_PMD;
1352 if (!try_to_unmap(page, flags)) {
1353 stat->nr_unmap_fail += nr_pages;
1354 goto activate_locked;
1355 }
1356 }
1357
1358 if (PageDirty(page)) {
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369 if (page_is_file_cache(page) &&
1370 (!current_is_kswapd() || !PageReclaim(page) ||
1371 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1372
1373
1374
1375
1376
1377
1378 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1379 SetPageReclaim(page);
1380
1381 goto activate_locked;
1382 }
1383
1384 if (references == PAGEREF_RECLAIM_CLEAN)
1385 goto keep_locked;
1386 if (!may_enter_fs)
1387 goto keep_locked;
1388 if (!sc->may_writepage)
1389 goto keep_locked;
1390
1391
1392
1393
1394
1395
1396 try_to_unmap_flush_dirty();
1397 switch (pageout(page, mapping, sc)) {
1398 case PAGE_KEEP:
1399 goto keep_locked;
1400 case PAGE_ACTIVATE:
1401 goto activate_locked;
1402 case PAGE_SUCCESS:
1403 if (PageWriteback(page))
1404 goto keep;
1405 if (PageDirty(page))
1406 goto keep;
1407
1408
1409
1410
1411
1412 if (!trylock_page(page))
1413 goto keep;
1414 if (PageDirty(page) || PageWriteback(page))
1415 goto keep_locked;
1416 mapping = page_mapping(page);
1417 case PAGE_CLEAN:
1418 ;
1419 }
1420 }
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443 if (page_has_private(page)) {
1444 if (!try_to_release_page(page, sc->gfp_mask))
1445 goto activate_locked;
1446 if (!mapping && page_count(page) == 1) {
1447 unlock_page(page);
1448 if (put_page_testzero(page))
1449 goto free_it;
1450 else {
1451
1452
1453
1454
1455
1456
1457
1458 nr_reclaimed++;
1459 continue;
1460 }
1461 }
1462 }
1463
1464 if (PageAnon(page) && !PageSwapBacked(page)) {
1465
1466 if (!page_ref_freeze(page, 1))
1467 goto keep_locked;
1468 if (PageDirty(page)) {
1469 page_ref_unfreeze(page, 1);
1470 goto keep_locked;
1471 }
1472
1473 count_vm_event(PGLAZYFREED);
1474 count_memcg_page_event(page, PGLAZYFREED);
1475 } else if (!mapping || !__remove_mapping(mapping, page, true))
1476 goto keep_locked;
1477
1478 unlock_page(page);
1479 free_it:
1480
1481
1482
1483
1484 nr_reclaimed += nr_pages;
1485
1486
1487
1488
1489
1490 if (unlikely(PageTransHuge(page)))
1491 (*get_compound_page_dtor(page))(page);
1492 else
1493 list_add(&page->lru, &free_pages);
1494 continue;
1495
1496 activate_locked_split:
1497
1498
1499
1500
1501 if (nr_pages > 1) {
1502 sc->nr_scanned -= (nr_pages - 1);
1503 nr_pages = 1;
1504 }
1505 activate_locked:
1506
1507 if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1508 PageMlocked(page)))
1509 try_to_free_swap(page);
1510 VM_BUG_ON_PAGE(PageActive(page), page);
1511 if (!PageMlocked(page)) {
1512 int type = page_is_file_cache(page);
1513 SetPageActive(page);
1514 stat->nr_activate[type] += nr_pages;
1515 count_memcg_page_event(page, PGACTIVATE);
1516 }
1517 keep_locked:
1518 unlock_page(page);
1519 keep:
1520 list_add(&page->lru, &ret_pages);
1521 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1522 }
1523
1524 pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
1525
1526 mem_cgroup_uncharge_list(&free_pages);
1527 try_to_unmap_flush();
1528 free_unref_page_list(&free_pages);
1529
1530 list_splice(&ret_pages, page_list);
1531 count_vm_events(PGACTIVATE, pgactivate);
1532
1533 return nr_reclaimed;
1534 }
1535
1536 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1537 struct list_head *page_list)
1538 {
1539 struct scan_control sc = {
1540 .gfp_mask = GFP_KERNEL,
1541 .priority = DEF_PRIORITY,
1542 .may_unmap = 1,
1543 };
1544 struct reclaim_stat dummy_stat;
1545 unsigned long ret;
1546 struct page *page, *next;
1547 LIST_HEAD(clean_pages);
1548
1549 list_for_each_entry_safe(page, next, page_list, lru) {
1550 if (page_is_file_cache(page) && !PageDirty(page) &&
1551 !__PageMovable(page) && !PageUnevictable(page)) {
1552 ClearPageActive(page);
1553 list_move(&page->lru, &clean_pages);
1554 }
1555 }
1556
1557 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1558 TTU_IGNORE_ACCESS, &dummy_stat, true);
1559 list_splice(&clean_pages, page_list);
1560 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
1561 return ret;
1562 }
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574 int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1575 {
1576 int ret = -EINVAL;
1577
1578
1579 if (!PageLRU(page))
1580 return ret;
1581
1582
1583 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1584 return ret;
1585
1586 ret = -EBUSY;
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596 if (mode & ISOLATE_ASYNC_MIGRATE) {
1597
1598 if (PageWriteback(page))
1599 return ret;
1600
1601 if (PageDirty(page)) {
1602 struct address_space *mapping;
1603 bool migrate_dirty;
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614 if (!trylock_page(page))
1615 return ret;
1616
1617 mapping = page_mapping(page);
1618 migrate_dirty = !mapping || mapping->a_ops->migratepage;
1619 unlock_page(page);
1620 if (!migrate_dirty)
1621 return ret;
1622 }
1623 }
1624
1625 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1626 return ret;
1627
1628 if (likely(get_page_unless_zero(page))) {
1629
1630
1631
1632
1633
1634 ClearPageLRU(page);
1635 ret = 0;
1636 }
1637
1638 return ret;
1639 }
1640
1641
1642
1643
1644
1645
1646 static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1647 enum lru_list lru, unsigned long *nr_zone_taken)
1648 {
1649 int zid;
1650
1651 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1652 if (!nr_zone_taken[zid])
1653 continue;
1654
1655 __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1656 #ifdef CONFIG_MEMCG
1657 mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1658 #endif
1659 }
1660
1661 }
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
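/*
 * Isolate up to nr_to_scan pages from the given LRU list onto @dst.
 * Called with pgdat->lru_lock held.  Pages from zones above
 * sc->reclaim_idx are skipped and put back on the source list.  Returns
 * the number of (base) pages taken; *nr_scanned is set to the number of
 * pages examined.
 */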
1683 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1684 struct lruvec *lruvec, struct list_head *dst,
1685 unsigned long *nr_scanned, struct scan_control *sc,
1686 enum lru_list lru)
1687 {
1688 struct list_head *src = &lruvec->lists[lru];
1689 unsigned long nr_taken = 0;
1690 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1691 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1692 unsigned long skipped = 0;
1693 unsigned long scan, total_scan, nr_pages;
1694 LIST_HEAD(pages_skipped);
1695 isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
1696
1697 total_scan = 0;
1698 scan = 0;
1699 while (scan < nr_to_scan && !list_empty(src)) {
1700 struct page *page;
1701
1702 page = lru_to_page(src);
1703 prefetchw_prev_lru_page(page, src, flags);
1704
1705 VM_BUG_ON_PAGE(!PageLRU(page), page);
1706
1707 nr_pages = compound_nr(page);
1708 total_scan += nr_pages;
1709
1710 if (page_zonenum(page) > sc->reclaim_idx) {
1711 list_move(&page->lru, &pages_skipped);
1712 nr_skipped[page_zonenum(page)] += nr_pages;
1713 continue;
1714 }
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726 scan += nr_pages;
1727 switch (__isolate_lru_page(page, mode)) {
1728 case 0:
1729 nr_taken += nr_pages;
1730 nr_zone_taken[page_zonenum(page)] += nr_pages;
1731 list_move(&page->lru, dst);
1732 break;
1733
1734 case -EBUSY:
1735
1736 list_move(&page->lru, src);
1737 continue;
1738
1739 default:
1740 BUG();
1741 }
1742 }
1743
1744
1745
1746
1747
1748
1749
1750
1751 if (!list_empty(&pages_skipped)) {
1752 int zid;
1753
1754 list_splice(&pages_skipped, src);
1755 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1756 if (!nr_skipped[zid])
1757 continue;
1758
1759 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1760 skipped += nr_skipped[zid];
1761 }
1762 }
1763 *nr_scanned = total_scan;
1764 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1765 total_scan, skipped, nr_taken, mode, lru);
1766 update_lru_sizes(lruvec, lru, nr_zone_taken);
1767 return nr_taken;
1768 }
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796 int isolate_lru_page(struct page *page)
1797 {
1798 int ret = -EBUSY;
1799
1800 VM_BUG_ON_PAGE(!page_count(page), page);
1801 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1802
1803 if (PageLRU(page)) {
1804 pg_data_t *pgdat = page_pgdat(page);
1805 struct lruvec *lruvec;
1806
1807 spin_lock_irq(&pgdat->lru_lock);
1808 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1809 if (PageLRU(page)) {
1810 int lru = page_lru(page);
1811 get_page(page);
1812 ClearPageLRU(page);
1813 del_page_from_lru_list(page, lruvec, lru);
1814 ret = 0;
1815 }
1816 spin_unlock_irq(&pgdat->lru_lock);
1817 }
1818 return ret;
1819 }
1820
1821
1822
1823
1824
1825
1826
1827
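/*
 * Throttle heuristic for direct reclaim: returns non-zero when the
 * number of pages already isolated from the LRU is large compared to
 * the inactive list, so callers back off instead of piling up more
 * isolations.  kswapd and legacy-memcg reclaim are never throttled here.
 */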
1828 static int too_many_isolated(struct pglist_data *pgdat, int file,
1829 struct scan_control *sc)
1830 {
1831 unsigned long inactive, isolated;
1832
1833 if (current_is_kswapd())
1834 return 0;
1835
1836 if (!sane_reclaim(sc))
1837 return 0;
1838
1839 if (file) {
1840 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1841 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1842 } else {
1843 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1844 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1845 }
1846
1847
1848
1849
1850
1851
1852 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
1853 inactive >>= 3;
1854
1855 return isolated > inactive;
1856 }
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
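/*
 * Move pages from a private @list back to the appropriate LRU lists.
 * Pages whose refcount drops to zero are stripped of their LRU state and
 * handed back on @list for the caller to free.  Returns the number of
 * pages actually moved to an LRU.  Called with pgdat->lru_lock held.
 */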
1878 static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
1879 struct list_head *list)
1880 {
1881 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1882 int nr_pages, nr_moved = 0;
1883 LIST_HEAD(pages_to_free);
1884 struct page *page;
1885 enum lru_list lru;
1886
1887 while (!list_empty(list)) {
1888 page = lru_to_page(list);
1889 VM_BUG_ON_PAGE(PageLRU(page), page);
1890 if (unlikely(!page_evictable(page))) {
1891 list_del(&page->lru);
1892 spin_unlock_irq(&pgdat->lru_lock);
1893 putback_lru_page(page);
1894 spin_lock_irq(&pgdat->lru_lock);
1895 continue;
1896 }
1897 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1898
1899 SetPageLRU(page);
1900 lru = page_lru(page);
1901
1902 nr_pages = hpage_nr_pages(page);
1903 update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
1904 list_move(&page->lru, &lruvec->lists[lru]);
1905
1906 if (put_page_testzero(page)) {
1907 __ClearPageLRU(page);
1908 __ClearPageActive(page);
1909 del_page_from_lru_list(page, lruvec, lru);
1910
1911 if (unlikely(PageCompound(page))) {
1912 spin_unlock_irq(&pgdat->lru_lock);
1913 (*get_compound_page_dtor(page))(page);
1914 spin_lock_irq(&pgdat->lru_lock);
1915 } else
1916 list_add(&page->lru, &pages_to_free);
1917 } else {
1918 nr_moved += nr_pages;
1919 }
1920 }
1921
1922
1923
1924
1925 list_splice(&pages_to_free, list);
1926
1927 return nr_moved;
1928 }
1929
1930
1931
1932
1933
1934
1935
1936 static int current_may_throttle(void)
1937 {
1938 return !(current->flags & PF_LESS_THROTTLE) ||
1939 current->backing_dev_info == NULL ||
1940 bdi_write_congested(current->backing_dev_info);
1941 }
1942
1943
1944
1945
1946
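/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the
 * number of reclaimed pages.
 */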
1947 static noinline_for_stack unsigned long
1948 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1949 struct scan_control *sc, enum lru_list lru)
1950 {
1951 LIST_HEAD(page_list);
1952 unsigned long nr_scanned;
1953 unsigned long nr_reclaimed = 0;
1954 unsigned long nr_taken;
1955 struct reclaim_stat stat;
1956 int file = is_file_lru(lru);
1957 enum vm_event_item item;
1958 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1959 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1960 bool stalled = false;
1961
1962 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1963 if (stalled)
1964 return 0;
1965
1966
1967 msleep(100);
1968 stalled = true;
1969
1970
1971 if (fatal_signal_pending(current))
1972 return SWAP_CLUSTER_MAX;
1973 }
1974
1975 lru_add_drain();
1976
1977 spin_lock_irq(&pgdat->lru_lock);
1978
1979 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1980 &nr_scanned, sc, lru);
1981
1982 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1983 reclaim_stat->recent_scanned[file] += nr_taken;
1984
1985 item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
1986 if (global_reclaim(sc))
1987 __count_vm_events(item, nr_scanned);
1988 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
1989 spin_unlock_irq(&pgdat->lru_lock);
1990
1991 if (nr_taken == 0)
1992 return 0;
1993
1994 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
1995 &stat, false);
1996
1997 spin_lock_irq(&pgdat->lru_lock);
1998
1999 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
2000 if (global_reclaim(sc))
2001 __count_vm_events(item, nr_reclaimed);
2002 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
2003 reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
2004 reclaim_stat->recent_rotated[1] += stat.nr_activate[1];
2005
2006 move_pages_to_lru(lruvec, &page_list);
2007
2008 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2009
2010 spin_unlock_irq(&pgdat->lru_lock);
2011
2012 mem_cgroup_uncharge_list(&page_list);
2013 free_unref_page_list(&page_list);
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026 if (stat.nr_unqueued_dirty == nr_taken)
2027 wakeup_flusher_threads(WB_REASON_VMSCAN);
2028
2029 sc->nr.dirty += stat.nr_dirty;
2030 sc->nr.congested += stat.nr_congested;
2031 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2032 sc->nr.writeback += stat.nr_writeback;
2033 sc->nr.immediate += stat.nr_immediate;
2034 sc->nr.taken += nr_taken;
2035 if (file)
2036 sc->nr.file_taken += nr_taken;
2037
2038 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2039 nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2040 return nr_reclaimed;
2041 }
2042
2043 static void shrink_active_list(unsigned long nr_to_scan,
2044 struct lruvec *lruvec,
2045 struct scan_control *sc,
2046 enum lru_list lru)
2047 {
2048 unsigned long nr_taken;
2049 unsigned long nr_scanned;
2050 unsigned long vm_flags;
2051 LIST_HEAD(l_hold);
2052 LIST_HEAD(l_active);
2053 LIST_HEAD(l_inactive);
2054 struct page *page;
2055 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2056 unsigned nr_deactivate, nr_activate;
2057 unsigned nr_rotated = 0;
2058 int file = is_file_lru(lru);
2059 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2060
2061 lru_add_drain();
2062
2063 spin_lock_irq(&pgdat->lru_lock);
2064
2065 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2066 &nr_scanned, sc, lru);
2067
2068 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2069 reclaim_stat->recent_scanned[file] += nr_taken;
2070
2071 __count_vm_events(PGREFILL, nr_scanned);
2072 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2073
2074 spin_unlock_irq(&pgdat->lru_lock);
2075
2076 while (!list_empty(&l_hold)) {
2077 cond_resched();
2078 page = lru_to_page(&l_hold);
2079 list_del(&page->lru);
2080
2081 if (unlikely(!page_evictable(page))) {
2082 putback_lru_page(page);
2083 continue;
2084 }
2085
2086 if (unlikely(buffer_heads_over_limit)) {
2087 if (page_has_private(page) && trylock_page(page)) {
2088 if (page_has_private(page))
2089 try_to_release_page(page, 0);
2090 unlock_page(page);
2091 }
2092 }
2093
2094 if (page_referenced(page, 0, sc->target_mem_cgroup,
2095 &vm_flags)) {
2096 nr_rotated += hpage_nr_pages(page);
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
2107 list_add(&page->lru, &l_active);
2108 continue;
2109 }
2110 }
2111
2112 ClearPageActive(page);
2113 SetPageWorkingset(page);
2114 list_add(&page->lru, &l_inactive);
2115 }
2116
2117
2118
2119
2120 spin_lock_irq(&pgdat->lru_lock);
2121
2122
2123
2124
2125
2126
2127 reclaim_stat->recent_rotated[file] += nr_rotated;
2128
2129 nr_activate = move_pages_to_lru(lruvec, &l_active);
2130 nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
2131
2132 list_splice(&l_inactive, &l_active);
2133
2134 __count_vm_events(PGDEACTIVATE, nr_deactivate);
2135 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2136
2137 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2138 spin_unlock_irq(&pgdat->lru_lock);
2139
2140 mem_cgroup_uncharge_list(&l_active);
2141 free_unref_page_list(&l_active);
2142 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2143 nr_deactivate, nr_rotated, sc->priority, file);
2144 }
2145
2146 unsigned long reclaim_pages(struct list_head *page_list)
2147 {
2148 int nid = -1;
2149 unsigned long nr_reclaimed = 0;
2150 LIST_HEAD(node_page_list);
2151 struct reclaim_stat dummy_stat;
2152 struct page *page;
2153 struct scan_control sc = {
2154 .gfp_mask = GFP_KERNEL,
2155 .priority = DEF_PRIORITY,
2156 .may_writepage = 1,
2157 .may_unmap = 1,
2158 .may_swap = 1,
2159 };
2160
2161 while (!list_empty(page_list)) {
2162 page = lru_to_page(page_list);
2163 if (nid == -1) {
2164 nid = page_to_nid(page);
2165 INIT_LIST_HEAD(&node_page_list);
2166 }
2167
2168 if (nid == page_to_nid(page)) {
2169 ClearPageActive(page);
2170 list_move(&page->lru, &node_page_list);
2171 continue;
2172 }
2173
2174 nr_reclaimed += shrink_page_list(&node_page_list,
2175 NODE_DATA(nid),
2176 &sc, 0,
2177 &dummy_stat, false);
2178 while (!list_empty(&node_page_list)) {
2179 page = lru_to_page(&node_page_list);
2180 list_del(&page->lru);
2181 putback_lru_page(page);
2182 }
2183
2184 nid = -1;
2185 }
2186
2187 if (!list_empty(&node_page_list)) {
2188 nr_reclaimed += shrink_page_list(&node_page_list,
2189 NODE_DATA(nid),
2190 &sc, 0,
2191 &dummy_stat, false);
2192 while (!list_empty(&node_page_list)) {
2193 page = lru_to_page(&node_page_list);
2194 list_del(&page->lru);
2195 putback_lru_page(page);
2196 }
2197 }
2198
2199 return nr_reclaimed;
2200 }
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
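/*
 * The inactive list is kept large enough that reclaim has a reasonable
 * working window: the target inactive:active ratio scales roughly as
 * sqrt(10 * list size in GB), and recent refaults on the file list force
 * deactivation regardless of the ratio.
 */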
2230 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
2231 struct scan_control *sc, bool trace)
2232 {
2233 enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
2234 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2235 enum lru_list inactive_lru = file * LRU_FILE;
2236 unsigned long inactive, active;
2237 unsigned long inactive_ratio;
2238 unsigned long refaults;
2239 unsigned long gb;
2240
2241
2242
2243
2244
2245 if (!file && !total_swap_pages)
2246 return false;
2247
2248 inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
2249 active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
2250
2251
2252
2253
2254
2255
2256 refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
2257 if (file && lruvec->refaults != refaults) {
2258 inactive_ratio = 0;
2259 } else {
2260 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2261 if (gb)
2262 inactive_ratio = int_sqrt(10 * gb);
2263 else
2264 inactive_ratio = 1;
2265 }
2266
2267 if (trace)
2268 trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
2269 lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
2270 lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
2271 inactive_ratio, file);
2272
2273 return inactive * inactive_ratio < active;
2274 }
2275
2276 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2277 struct lruvec *lruvec, struct scan_control *sc)
2278 {
2279 if (is_active_lru(lru)) {
2280 if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
2281 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2282 return 0;
2283 }
2284
2285 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2286 }
2287
2288 enum scan_balance {
2289 SCAN_EQUAL,
2290 SCAN_FRACT,
2291 SCAN_ANON,
2292 SCAN_FILE,
2293 };
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
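/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */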
2304 static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2305 struct scan_control *sc, unsigned long *nr,
2306 unsigned long *lru_pages)
2307 {
2308 int swappiness = mem_cgroup_swappiness(memcg);
2309 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2310 u64 fraction[2];
2311 u64 denominator = 0;
2312 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2313 unsigned long anon_prio, file_prio;
2314 enum scan_balance scan_balance;
2315 unsigned long anon, file;
2316 unsigned long ap, fp;
2317 enum lru_list lru;
2318
2319
2320 if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2321 scan_balance = SCAN_FILE;
2322 goto out;
2323 }
2324
2325
2326
2327
2328
2329
2330
2331
2332 if (!global_reclaim(sc) && !swappiness) {
2333 scan_balance = SCAN_FILE;
2334 goto out;
2335 }
2336
2337
2338
2339
2340
2341
2342 if (!sc->priority && swappiness) {
2343 scan_balance = SCAN_EQUAL;
2344 goto out;
2345 }
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356 if (global_reclaim(sc)) {
2357 unsigned long pgdatfile;
2358 unsigned long pgdatfree;
2359 int z;
2360 unsigned long total_high_wmark = 0;
2361
2362 pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2363 pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
2364 node_page_state(pgdat, NR_INACTIVE_FILE);
2365
2366 for (z = 0; z < MAX_NR_ZONES; z++) {
2367 struct zone *zone = &pgdat->node_zones[z];
2368 if (!managed_zone(zone))
2369 continue;
2370
2371 total_high_wmark += high_wmark_pages(zone);
2372 }
2373
2374 if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
2375
2376
2377
2378
2379
2380 if (!inactive_list_is_low(lruvec, false, sc, false) &&
2381 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
2382 >> sc->priority) {
2383 scan_balance = SCAN_ANON;
2384 goto out;
2385 }
2386 }
2387 }
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398 if (!inactive_list_is_low(lruvec, true, sc, false) &&
2399 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
2400 scan_balance = SCAN_FILE;
2401 goto out;
2402 }
2403
2404 scan_balance = SCAN_FRACT;
2405
2406
2407
2408
2409
2410 anon_prio = swappiness;
2411 file_prio = 200 - anon_prio;
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
2426 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
2427 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
2428 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
2429
2430 spin_lock_irq(&pgdat->lru_lock);
2431 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
2432 reclaim_stat->recent_scanned[0] /= 2;
2433 reclaim_stat->recent_rotated[0] /= 2;
2434 }
2435
2436 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
2437 reclaim_stat->recent_scanned[1] /= 2;
2438 reclaim_stat->recent_rotated[1] /= 2;
2439 }
2440
2441
2442
2443
2444
2445
2446 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
2447 ap /= reclaim_stat->recent_rotated[0] + 1;
2448
2449 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
2450 fp /= reclaim_stat->recent_rotated[1] + 1;
2451 spin_unlock_irq(&pgdat->lru_lock);
2452
2453 fraction[0] = ap;
2454 fraction[1] = fp;
2455 denominator = ap + fp + 1;
2456 out:
2457 *lru_pages = 0;
2458 for_each_evictable_lru(lru) {
2459 int file = is_file_lru(lru);
2460 unsigned long lruvec_size;
2461 unsigned long scan;
2462 unsigned long protection;
2463
2464 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2465 protection = mem_cgroup_protection(memcg,
2466 sc->memcg_low_reclaim);
2467
2468 if (protection) {
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498 unsigned long cgroup_size = mem_cgroup_size(memcg);
2499
2500
2501 cgroup_size = max(cgroup_size, protection);
2502
2503 scan = lruvec_size - lruvec_size * protection /
2504 cgroup_size;
2505
2506
2507
2508
2509
2510
2511 scan = max(scan, SWAP_CLUSTER_MAX);
2512 } else {
2513 scan = lruvec_size;
2514 }
2515
2516 scan >>= sc->priority;
2517
2518
2519
2520
2521
2522 if (!scan && !mem_cgroup_online(memcg))
2523 scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2524
2525 switch (scan_balance) {
2526 case SCAN_EQUAL:
2527
2528 break;
2529 case SCAN_FRACT:
2530
2531
2532
2533
2534
2535
2536
2537 scan = mem_cgroup_online(memcg) ?
2538 div64_u64(scan * fraction[file], denominator) :
2539 DIV64_U64_ROUND_UP(scan * fraction[file],
2540 denominator);
2541 break;
2542 case SCAN_FILE:
2543 case SCAN_ANON:
2544
2545 if ((scan_balance == SCAN_FILE) != file) {
2546 lruvec_size = 0;
2547 scan = 0;
2548 }
2549 break;
2550 default:
2551
2552 BUG();
2553 }
2554
2555 *lru_pages += lruvec_size;
2556 nr[lru] = scan;
2557 }
2558 }
2559
2560
2561
2562
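/*
 * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
 */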
2563 static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
2564 struct scan_control *sc, unsigned long *lru_pages)
2565 {
2566 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2567 unsigned long nr[NR_LRU_LISTS];
2568 unsigned long targets[NR_LRU_LISTS];
2569 unsigned long nr_to_scan;
2570 enum lru_list lru;
2571 unsigned long nr_reclaimed = 0;
2572 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2573 struct blk_plug plug;
2574 bool scan_adjusted;
2575
2576 get_scan_count(lruvec, memcg, sc, nr, lru_pages);
2577
2578
2579 memcpy(targets, nr, sizeof(nr));
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2593 sc->priority == DEF_PRIORITY);
2594
2595 blk_start_plug(&plug);
2596 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2597 nr[LRU_INACTIVE_FILE]) {
2598 unsigned long nr_anon, nr_file, percentage;
2599 unsigned long nr_scanned;
2600
2601 for_each_evictable_lru(lru) {
2602 if (nr[lru]) {
2603 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2604 nr[lru] -= nr_to_scan;
2605
2606 nr_reclaimed += shrink_list(lru, nr_to_scan,
2607 lruvec, sc);
2608 }
2609 }
2610
2611 cond_resched();
2612
2613 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2614 continue;
2615
2616
2617
2618
2619
2620
2621
2622
2623 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2624 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2625
2626
2627
2628
2629
2630
2631
2632 if (!nr_file || !nr_anon)
2633 break;
2634
2635 if (nr_file > nr_anon) {
2636 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2637 targets[LRU_ACTIVE_ANON] + 1;
2638 lru = LRU_BASE;
2639 percentage = nr_anon * 100 / scan_target;
2640 } else {
2641 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2642 targets[LRU_ACTIVE_FILE] + 1;
2643 lru = LRU_FILE;
2644 percentage = nr_file * 100 / scan_target;
2645 }
2646
2647
2648 nr[lru] = 0;
2649 nr[lru + LRU_ACTIVE] = 0;
2650
2651
2652
2653
2654
2655 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2656 nr_scanned = targets[lru] - nr[lru];
2657 nr[lru] = targets[lru] * (100 - percentage) / 100;
2658 nr[lru] -= min(nr[lru], nr_scanned);
2659
2660 lru += LRU_ACTIVE;
2661 nr_scanned = targets[lru] - nr[lru];
2662 nr[lru] = targets[lru] * (100 - percentage) / 100;
2663 nr[lru] -= min(nr[lru], nr_scanned);
2664
2665 scan_adjusted = true;
2666 }
2667 blk_finish_plug(&plug);
2668 sc->nr_reclaimed += nr_reclaimed;
2669
2670
2671
2672
2673
2674 if (inactive_list_is_low(lruvec, false, sc, true))
2675 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2676 sc, LRU_ACTIVE_ANON);
2677 }
2678
2679
2680 static bool in_reclaim_compaction(struct scan_control *sc)
2681 {
2682 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2683 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2684 sc->priority < DEF_PRIORITY - 2))
2685 return true;
2686
2687 return false;
2688 }
2689
2690
2691
2692
2693
2694
2695
2696
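/*
 * Reclaim/compaction is used for high-order allocation requests.  It
 * reclaims order-0 pages before compacting the zone.  Returns true if
 * more reclaim should be run so that compaction has a better chance of
 * succeeding.
 */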
2697 static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2698 unsigned long nr_reclaimed,
2699 struct scan_control *sc)
2700 {
2701 unsigned long pages_for_compaction;
2702 unsigned long inactive_lru_pages;
2703 int z;
2704
2705
2706 if (!in_reclaim_compaction(sc))
2707 return false;
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719 if (!nr_reclaimed)
2720 return false;
2721
2722
2723 for (z = 0; z <= sc->reclaim_idx; z++) {
2724 struct zone *zone = &pgdat->node_zones[z];
2725 if (!managed_zone(zone))
2726 continue;
2727
2728 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2729 case COMPACT_SUCCESS:
2730 case COMPACT_CONTINUE:
2731 return false;
2732 default:
2733
2734 ;
2735 }
2736 }
2737
2738
2739
2740
2741
2742 pages_for_compaction = compact_gap(sc->order);
2743 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2744 if (get_nr_swap_pages() > 0)
2745 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2746
2747 return inactive_lru_pages > pages_for_compaction;
2748 }
2749
2750 static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
2751 {
2752 return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
2753 (memcg && memcg_congested(pgdat, memcg));
2754 }
2755
2756 static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2757 {
2758 struct reclaim_state *reclaim_state = current->reclaim_state;
2759 unsigned long nr_reclaimed, nr_scanned;
2760 bool reclaimable = false;
2761
2762 do {
2763 struct mem_cgroup *root = sc->target_mem_cgroup;
2764 unsigned long node_lru_pages = 0;
2765 struct mem_cgroup *memcg;
2766
2767 memset(&sc->nr, 0, sizeof(sc->nr));
2768
2769 nr_reclaimed = sc->nr_reclaimed;
2770 nr_scanned = sc->nr_scanned;
2771
2772 memcg = mem_cgroup_iter(root, NULL, NULL);
2773 do {
2774 unsigned long lru_pages;
2775 unsigned long reclaimed;
2776 unsigned long scanned;
2777
2778 switch (mem_cgroup_protected(root, memcg)) {
2779 case MEMCG_PROT_MIN:
2780
2781
2782
2783
2784 continue;
2785 case MEMCG_PROT_LOW:
2786
2787
2788
2789
2790
2791
2792 if (!sc->memcg_low_reclaim) {
2793 sc->memcg_low_skipped = 1;
2794 continue;
2795 }
2796 memcg_memory_event(memcg, MEMCG_LOW);
2797 break;
2798 case MEMCG_PROT_NONE:
2799
2800
2801
2802
2803
2804
2805
2806 break;
2807 }
2808
2809 reclaimed = sc->nr_reclaimed;
2810 scanned = sc->nr_scanned;
2811 shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
2812 node_lru_pages += lru_pages;
2813
2814 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
2815 sc->priority);
2816
2817
2818 vmpressure(sc->gfp_mask, memcg, false,
2819 sc->nr_scanned - scanned,
2820 sc->nr_reclaimed - reclaimed);
2821
2822 } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
2823
2824 if (reclaim_state) {
2825 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2826 reclaim_state->reclaimed_slab = 0;
2827 }
2828
2829
2830 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2831 sc->nr_scanned - nr_scanned,
2832 sc->nr_reclaimed - nr_reclaimed);
2833
2834 if (sc->nr_reclaimed - nr_reclaimed)
2835 reclaimable = true;
2836
2837 if (current_is_kswapd()) {
2838 /*
2839  * If reclaim is isolating dirty pages under writeback,
2840  * it implies that the long-lived page allocation rate
2841  * is exceeding the page laundering rate. Either the
2842  * global limits are not being effective at throttling
2843  * processes due to the page distribution throughout
2844  * zones or there is heavy usage of a slow backing
2845  * device. The only option is to throttle from reclaim
2846  * context which is not ideal as there is no guarantee
2847  * the dirtying process is throttled in the same way
2848  * balance_dirty_pages() manages.
2849  *
2850  * Once a node is flagged PGDAT_WRITEBACK, kswapd will
2851  * count the number of pages under writeback that are
2852  * flagged for immediate reclaim and stall if any are
2853  * encountered in the nr_immediate check below.
2854  */
2855 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
2856 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
2857
2858 /*
2859  * Tag a node as congested if all the dirty pages
2860  * scanned were backed by a congested BDI and
2861  * wait_iff_congested will stall.
2862  */
2863 if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2864 set_bit(PGDAT_CONGESTED, &pgdat->flags);
2865
2866
2867 if (sc->nr.unqueued_dirty == sc->nr.file_taken)
2868 set_bit(PGDAT_DIRTY, &pgdat->flags);
2869
2870 /*
2871  * If kswapd scans pages marked for immediate
2872  * reclaim and under writeback (nr_immediate), it
2873  * implies that pages are cycling through the LRU
2874  * faster than they are written so also forcibly stall.
2875  */
2876 if (sc->nr.immediate)
2877 congestion_wait(BLK_RW_ASYNC, HZ/10);
2878 }
2879
2880 /*
2881  * For cgroup v2 memcg reclaim, tag the memcg as congested if all dirty
2882  * pages scanned were backed by a congested BDI (see the stall below).
2883  */
2884 if (!global_reclaim(sc) && sane_reclaim(sc) &&
2885 sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2886 set_memcg_congestion(pgdat, root, true);
2887
2888 /*
2889  * Stall direct reclaim for IO completions if underlying BDIs
2890  * and node is congested. Allow kswapd to continue until it
2891  * starts encountering unqueued dirty pages or cycling through
2892  * the LRU too quickly.
2893  */
2894 if (!sc->hibernation_mode && !current_is_kswapd() &&
2895 current_may_throttle() && pgdat_memcg_congested(pgdat, root))
2896 wait_iff_congested(BLK_RW_ASYNC, HZ/10);
2897
2898 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2899 sc));
2900
2901 /*
2902  * Kswapd gives up on balancing particular nodes after too
2903  * many failures to reclaim anything from them and goes to
2904  * sleep. On reclaim progress, reset the failure counter. A
2905  * successful direct reclaim run will revive a dormant kswapd.
2906  */
2907 if (reclaimable)
2908 pgdat->kswapd_failures = 0;
2909
2910 return reclaimable;
2911 }
2912
2913 /*
2914  * Returns true if compaction should go ahead for a costly-order request,
2915  * or the allocation would already succeed without compaction. Return
2916  * false if we should reclaim first.
2917  */
2918 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2919 {
2920 unsigned long watermark;
2921 enum compact_result suitable;
2922
2923 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2924 if (suitable == COMPACT_SUCCESS)
2925
2926 return true;
2927 if (suitable == COMPACT_SKIPPED)
2928
2929 return false;
2930
2931 /*
2932  * Compaction is already possible, but it takes time to run and there
2933  * are potentially other callers using the pages just freed. So proceed
2934  * with reclaim to make a buffer of free pages available to give
2935  * compaction a reasonable chance of completing and allocating the page.
2936  * Note that we won't actually reclaim the whole buffer in one attempt
2937  * as the target watermark in should_continue_reclaim() is lower. But if
2938  * we are already above the high+gap watermark, don't reclaim at all.
2939  */
2940 watermark = high_wmark_pages(zone) + compact_gap(sc->order);
2941
2942 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2943 }
2944
2945 /*
2946  * This is the direct reclaim path, for page-allocating processes.  We only
2947  * try to reclaim pages from zones which will satisfy the caller's allocation
2948  * request.
2949  *
2950  * If a zone is deemed to be full of pinned pages then just give up on it
2951  * and skip to the next zone.
2952  */
2953 static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2954 {
2955 struct zoneref *z;
2956 struct zone *zone;
2957 unsigned long nr_soft_reclaimed;
2958 unsigned long nr_soft_scanned;
2959 gfp_t orig_mask;
2960 pg_data_t *last_pgdat = NULL;
2961
2962
2963
2964
2965
2966
2967 orig_mask = sc->gfp_mask;
2968 if (buffer_heads_over_limit) {
2969 sc->gfp_mask |= __GFP_HIGHMEM;
2970 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
2971 }
2972
2973 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2974 sc->reclaim_idx, sc->nodemask) {
2975
2976
2977
2978
2979 if (global_reclaim(sc)) {
2980 if (!cpuset_zone_allowed(zone,
2981 GFP_KERNEL | __GFP_HARDWALL))
2982 continue;
2983
2984 /*
2985  * If we already have plenty of memory free for
2986  * compaction in this zone, don't free any more.
2987  * Even though compaction is invoked for any
2988  * non-zero order, only frequent costly order
2989  * reclamation is disruptive enough to become a
2990  * noticeable problem, like transparent huge
2991  * page allocations.
2992  */
2993 if (IS_ENABLED(CONFIG_COMPACTION) &&
2994 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2995 compaction_ready(zone, sc)) {
2996 sc->compaction_ready = true;
2997 continue;
2998 }
2999
3000
3001
3002
3003
3004
3005
3006 if (zone->zone_pgdat == last_pgdat)
3007 continue;
3008
3009 /*
3010  * This steals pages from memory cgroups over softlimit
3011  * and returns the number of reclaimed pages and
3012  * scanned pages. This works for global memory pressure
3013  * and balancing, not for a memcg's limit.
3014  */
3015 nr_soft_scanned = 0;
3016 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
3017 sc->order, sc->gfp_mask,
3018 &nr_soft_scanned);
3019 sc->nr_reclaimed += nr_soft_reclaimed;
3020 sc->nr_scanned += nr_soft_scanned;
3021
3022 }
3023
3024
3025 if (zone->zone_pgdat == last_pgdat)
3026 continue;
3027 last_pgdat = zone->zone_pgdat;
3028 shrink_node(zone->zone_pgdat, sc);
3029 }
3030
3031 /*
3032  * Restore to original mask to avoid the impact on the caller if we
3033  * promoted it to __GFP_HIGHMEM.
3034  */
3035 sc->gfp_mask = orig_mask;
3036 }
3037
3038 static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
3039 {
3040 struct mem_cgroup *memcg;
3041
3042 memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
3043 do {
3044 unsigned long refaults;
3045 struct lruvec *lruvec;
3046
3047 lruvec = mem_cgroup_lruvec(pgdat, memcg);
3048 refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
3049 lruvec->refaults = refaults;
3050 } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
3051 }
3052
3053 /*
3054  * This is the main entry point to direct page reclaim.
3055  *
3056  * If a full scan of the inactive list fails to free enough memory then we
3057  * are "out of memory" and something needs to be killed.
3058  *
3059  * If the caller is !__GFP_FS then the probability of a failure is reasonably
3060  * high - the zone may be full of dirty or under-writeback pages, which this
3061  * caller can't do much about.  We kick the writeback threads and take explicit
3062  * naps in the hope that some of these pages can be written.  But if the
3063  * allocating task holds filesystem locks which prevent writeout this might not
3064  * work, and the allocation attempt will fail.
3065  *
3066  * returns:  0, if no pages reclaimed
3067  *           else, the number of pages reclaimed
3068  */
3069 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3070 struct scan_control *sc)
3071 {
3072 int initial_priority = sc->priority;
3073 pg_data_t *last_pgdat;
3074 struct zoneref *z;
3075 struct zone *zone;
3076 retry:
3077 delayacct_freepages_start();
3078
3079 if (global_reclaim(sc))
3080 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
3081
3082 do {
3083 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
3084 sc->priority);
3085 sc->nr_scanned = 0;
3086 shrink_zones(zonelist, sc);
3087
3088 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3089 break;
3090
3091 if (sc->compaction_ready)
3092 break;
3093
3094 /*
3095  * If we're getting trouble reclaiming, start doing
3096  * writepage even in laptop mode.
3097  */
3098 if (sc->priority < DEF_PRIORITY - 2)
3099 sc->may_writepage = 1;
3100 } while (--sc->priority >= 0);
3101
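/*
 * Record the refault baseline for every lruvec that was visited and
 * clear any memcg congestion state, so the next reclaim run starts
 * from fresh workingset and congestion signals.
 */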
3102 last_pgdat = NULL;
3103 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3104 sc->nodemask) {
3105 if (zone->zone_pgdat == last_pgdat)
3106 continue;
3107 last_pgdat = zone->zone_pgdat;
3108 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3109 set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
3110 }
3111
3112 delayacct_freepages_end();
3113
3114 if (sc->nr_reclaimed)
3115 return sc->nr_reclaimed;
3116
3117 /* Aborted reclaim to try compaction? don't OOM, then */
3118 if (sc->compaction_ready)
3119 return 1;
3120
3121 /* Untapped cgroup reserves?  Don't OOM, retry. */
3122 if (sc->memcg_low_skipped) {
3123 sc->priority = initial_priority;
3124 sc->memcg_low_reclaim = 1;
3125 sc->memcg_low_skipped = 0;
3126 goto retry;
3127 }
3128
3129 return 0;
3130 }
3131
3132 static bool allow_direct_reclaim(pg_data_t *pgdat)
3133 {
3134 struct zone *zone;
3135 unsigned long pfmemalloc_reserve = 0;
3136 unsigned long free_pages = 0;
3137 int i;
3138 bool wmark_ok;
3139
3140 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3141 return true;
3142
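/*
 * Sum the min-watermark reserves and free pages of the lowmem zones
 * (ZONE_NORMAL and below); the throttling decision below compares the
 * free pages against half of those reserves.
 */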
3143 for (i = 0; i <= ZONE_NORMAL; i++) {
3144 zone = &pgdat->node_zones[i];
3145 if (!managed_zone(zone))
3146 continue;
3147
3148 if (!zone_reclaimable_pages(zone))
3149 continue;
3150
3151 pfmemalloc_reserve += min_wmark_pages(zone);
3152 free_pages += zone_page_state(zone, NR_FREE_PAGES);
3153 }
3154
3155
3156 if (!pfmemalloc_reserve)
3157 return true;
3158
3159 wmark_ok = free_pages > pfmemalloc_reserve / 2;
3160
3161
3162 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3163 pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
3164 (enum zone_type)ZONE_NORMAL);
3165 wake_up_interruptible(&pgdat->kswapd_wait);
3166 }
3167
3168 return wmark_ok;
3169 }
3170
3171 /*
3172  * Throttle direct reclaimers if backing storage is backed by the network
3173  * and the PFMEMALLOC reserve for the preferred node is getting dangerously
3174  * depleted. kswapd will continue to make progress and wake the processes
3175  * when the low watermark is reached.
3176  *
3177  * Returns true if a fatal signal was delivered during throttling. If this
3178  * happens, the page allocator should not consider triggering the OOM killer.
3179  */
3180 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3181 nodemask_t *nodemask)
3182 {
3183 struct zoneref *z;
3184 struct zone *zone;
3185 pg_data_t *pgdat = NULL;
3186
3187 /*
3188  * Kernel threads should not be throttled as they may be indirectly
3189  * responsible for cleaning pages necessary for reclaim to make forward
3190  * progress. kjournald for example may enter direct reclaim while
3191  * committing a transaction where throttling it could force other
3192  * processes to block on log_wait_commit().
3193  */
3194 if (current->flags & PF_KTHREAD)
3195 goto out;
3196
3197
3198
3199
3200
3201 if (fatal_signal_pending(current))
3202 goto out;
3203
3204 /*
3205  * Check if the pfmemalloc reserves are ok by finding the first node
3206  * with a usable ZONE_NORMAL or lower zone. The expectation is that
3207  * GFP_KERNEL will be required for allocating network buffers when
3208  * swapping over the network so ZONE_HIGHMEM is unusable.
3209  *
3210  * Throttling is based on the first usable node and throttled processes
3211  * wait on a queue until kswapd makes progress and wakes them. There
3212  * is an affinity then between processes waking up and where reclaim
3213  * progress has been made assuming the process wakes on the same node.
3214  * More importantly, processes running on remote nodes will not compete
3215  * for remote pfmemalloc reserves and processes on different nodes
3216  * should make reasonable progress.
3217  */
3218 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3219 gfp_zone(gfp_mask), nodemask) {
3220 if (zone_idx(zone) > ZONE_NORMAL)
3221 continue;
3222
3223
3224 pgdat = zone->zone_pgdat;
3225 if (allow_direct_reclaim(pgdat))
3226 goto out;
3227 break;
3228 }
3229
3230
3231 if (!pgdat)
3232 goto out;
3233
3234
3235 count_vm_event(PGSCAN_DIRECT_THROTTLE);
3236
3237 /*
3238  * If the caller cannot enter the filesystem, it's possible that it
3239  * is due to the caller holding an FS lock or performing a journal
3240  * transaction in the meantime. In this case, it is not safe to block
3241  * on pfmemalloc_wait as kswapd could be blocked waiting on the same
3242  * lock. Instead, throttle for up to a second before continuing and
3243  * then proceed regardless.
3244  */
3245 if (!(gfp_mask & __GFP_FS)) {
3246 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3247 allow_direct_reclaim(pgdat), HZ);
3248
3249 goto check_pending;
3250 }
3251
3252
3253 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3254 allow_direct_reclaim(pgdat));
3255
3256 check_pending:
3257 if (fatal_signal_pending(current))
3258 return true;
3259
3260 out:
3261 return false;
3262 }
3263
3264 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3265 gfp_t gfp_mask, nodemask_t *nodemask)
3266 {
3267 unsigned long nr_reclaimed;
3268 struct scan_control sc = {
3269 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3270 .gfp_mask = current_gfp_context(gfp_mask),
3271 .reclaim_idx = gfp_zone(gfp_mask),
3272 .order = order,
3273 .nodemask = nodemask,
3274 .priority = DEF_PRIORITY,
3275 .may_writepage = !laptop_mode,
3276 .may_unmap = 1,
3277 .may_swap = 1,
3278 };
3279
3280 /*
3281  * scan_control uses s8 fields for order, priority, and reclaim_idx.
3282  * Confirm they are large enough for max values.
3283  */
3284 BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3285 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3286 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3287
3288 /*
3289  * Do not enter reclaim if fatal signal was delivered while throttled.
3290  * 1 is returned so that the page allocator does not OOM kill at this
3291  * point.
3292  */
3293 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3294 return 1;
3295
3296 set_task_reclaim_state(current, &sc.reclaim_state);
3297 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
3298
3299 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3300
3301 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3302 set_task_reclaim_state(current, NULL);
3303
3304 return nr_reclaimed;
3305 }
3306
3307 #ifdef CONFIG_MEMCG
3308
3309
3310 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3311 gfp_t gfp_mask, bool noswap,
3312 pg_data_t *pgdat,
3313 unsigned long *nr_scanned)
3314 {
3315 struct scan_control sc = {
3316 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3317 .target_mem_cgroup = memcg,
3318 .may_writepage = !laptop_mode,
3319 .may_unmap = 1,
3320 .reclaim_idx = MAX_NR_ZONES - 1,
3321 .may_swap = !noswap,
3322 };
3323 unsigned long lru_pages;
3324
3325 WARN_ON_ONCE(!current->reclaim_state);
3326
3327 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3328 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3329
3330 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3331 sc.gfp_mask);
3332
3333
3334
3335
3336
3337
3338
3339
3340 shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
3341
3342 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3343
3344 *nr_scanned = sc.nr_scanned;
3345
3346 return sc.nr_reclaimed;
3347 }
3348
3349 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3350 unsigned long nr_pages,
3351 gfp_t gfp_mask,
3352 bool may_swap)
3353 {
3354 struct zonelist *zonelist;
3355 unsigned long nr_reclaimed;
3356 unsigned long pflags;
3357 int nid;
3358 unsigned int noreclaim_flag;
3359 struct scan_control sc = {
3360 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3361 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3362 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3363 .reclaim_idx = MAX_NR_ZONES - 1,
3364 .target_mem_cgroup = memcg,
3365 .priority = DEF_PRIORITY,
3366 .may_writepage = !laptop_mode,
3367 .may_unmap = 1,
3368 .may_swap = may_swap,
3369 };
3370
3371 set_task_reclaim_state(current, &sc.reclaim_state);
3372
3373 /*
3374  * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't care
3375  * from which node pages are freed, so the scan may start on any node.
3376  */
3377 nid = mem_cgroup_select_victim_node(memcg);
3378
3379 zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
3380
3381 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
3382
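/*
 * Account the stall for PSI memory pressure and set PF_MEMALLOC via
 * memalloc_noreclaim_save() so the reclaim work below cannot recurse
 * into direct reclaim itself.
 */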
3383 psi_memstall_enter(&pflags);
3384 noreclaim_flag = memalloc_noreclaim_save();
3385
3386 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3387
3388 memalloc_noreclaim_restore(noreclaim_flag);
3389 psi_memstall_leave(&pflags);
3390
3391 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3392 set_task_reclaim_state(current, NULL);
3393
3394 return nr_reclaimed;
3395 }
3396 #endif
3397
3398 static void age_active_anon(struct pglist_data *pgdat,
3399 struct scan_control *sc)
3400 {
3401 struct mem_cgroup *memcg;
3402
3403 if (!total_swap_pages)
3404 return;
3405
3406 memcg = mem_cgroup_iter(NULL, NULL, NULL);
3407 do {
3408 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
3409
3410 if (inactive_list_is_low(lruvec, false, sc, true))
3411 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3412 sc, LRU_ACTIVE_ANON);
3413
3414 memcg = mem_cgroup_iter(NULL, memcg, NULL);
3415 } while (memcg);
3416 }
3417
3418 static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
3419 {
3420 int i;
3421 struct zone *zone;
3422
3423
3424
3425
3426
3427
3428
3429
3430 for (i = classzone_idx; i >= 0; i--) {
3431 zone = pgdat->node_zones + i;
3432 if (!managed_zone(zone))
3433 continue;
3434
3435 if (zone->watermark_boost)
3436 return true;
3437 }
3438
3439 return false;
3440 }
3441
3442 /*
3443  * Returns true if there is an eligible zone balanced for the request order
3444  * and classzone_idx
3445  */
3446 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3447 {
3448 int i;
3449 unsigned long mark = -1;
3450 struct zone *zone;
3451
3452
3453
3454
3455
3456 for (i = 0; i <= classzone_idx; i++) {
3457 zone = pgdat->node_zones + i;
3458
3459 if (!managed_zone(zone))
3460 continue;
3461
3462 mark = high_wmark_pages(zone);
3463 if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
3464 return true;
3465 }
3466
3467 /*
3468  * If a node has no populated zone within classzone_idx, it does not
3469  * need balancing by definition. This can happen if a zone-restricted
3470  * allocation tries to wake a remote kswapd.
3471  */
3472 if (mark == -1)
3473 return true;
3474
3475 return false;
3476 }
3477
3478
3479 static void clear_pgdat_congested(pg_data_t *pgdat)
3480 {
3481 clear_bit(PGDAT_CONGESTED, &pgdat->flags);
3482 clear_bit(PGDAT_DIRTY, &pgdat->flags);
3483 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3484 }
3485
3486 /*
3487  * Prepare kswapd for sleeping. This verifies that there are no processes
3488  * waiting in throttle_direct_reclaim() and that watermarks have been met.
3489  *
3490  * Returns true if kswapd is ready to sleep
3491  */
3492 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3493 {
3494 /*
3495  * The throttled processes are normally woken up in balance_pgdat() as
3496  * soon as allow_direct_reclaim() is true. But there is a potential
3497  * race between when kswapd checks the watermarks and a process gets
3498  * throttled. There is also a potential race if processes get
3499  * throttled, kswapd wakes, a large process exits thereby balancing the
3500  * zones, which causes kswapd to miss the wakeup event. That wakeup
3501  * would then never happen because kswapd goes to sleep with the node
3502  * balanced, so before sleeping wake up any direct reclaimers that are
3503  * still waiting on pfmemalloc_wait. If the wakeup turns out to be
3504  * premature, the woken processes will simply recheck
3505  * allow_direct_reclaim() and, if necessary, throttle again.
3506  */
3507 if (waitqueue_active(&pgdat->pfmemalloc_wait))
3508 wake_up_all(&pgdat->pfmemalloc_wait);
3509
3510 /* Hopeless node, leave it to direct reclaim */
3511 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3512 return true;
3513
3514 if (pgdat_balanced(pgdat, order, classzone_idx)) {
3515 clear_pgdat_congested(pgdat);
3516 return true;
3517 }
3518
3519 return false;
3520 }
3521
3522 /*
3523  * kswapd shrinks a node of pages that are at or below the highest usable
3524  * zone that is currently unbalanced.
3525  *
3526  * Returns true if kswapd scanned at least the requested number of
3527  * pages to reclaim. This is used to determine if the scanning priority
3528  * needs to be raised.
3529  */
3530 static bool kswapd_shrink_node(pg_data_t *pgdat,
3531 struct scan_control *sc)
3532 {
3533 struct zone *zone;
3534 int z;
3535
3536
3537 sc->nr_to_reclaim = 0;
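/*
 * Ask for at least the high watermark's worth of pages from every
 * eligible zone, with a floor of SWAP_CLUSTER_MAX per zone.
 */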
3538 for (z = 0; z <= sc->reclaim_idx; z++) {
3539 zone = pgdat->node_zones + z;
3540 if (!managed_zone(zone))
3541 continue;
3542
3543 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3544 }
3545
3546
3547
3548
3549
3550 shrink_node(pgdat, sc);
3551
3552 /*
3553  * Fragmentation may mean that the system cannot be rebalanced for
3554  * high-order allocations. If twice the allocation size has been
3555  * reclaimed then recheck watermarks only at order-0 to prevent
3556  * excessive reclaim. Assume that a process that requested a high-order
3557  * allocation can direct reclaim/compact itself.
3558  */
3559 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3560 sc->order = 0;
3561
3562 return sc->nr_scanned >= sc->nr_to_reclaim;
3563 }
3564
3565 /*
3566  * For kswapd, balance_pgdat() will reclaim pages across a node from zones
3567  * that are eligible for use by the caller until at least one zone is
3568  * balanced.
3569  *
3570  * Returns the order kswapd finished reclaiming at.
3571  *
3572  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
3573  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
3574  * found to have free_pages <= high_wmark_pages(zone), any page in that zone
3575  * or lower is eligible for reclaim until at least one usable zone is
3576  * balanced.
3577  */
3578 static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3579 {
3580 int i;
3581 unsigned long nr_soft_reclaimed;
3582 unsigned long nr_soft_scanned;
3583 unsigned long pflags;
3584 unsigned long nr_boost_reclaim;
3585 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
3586 bool boosted;
3587 struct zone *zone;
3588 struct scan_control sc = {
3589 .gfp_mask = GFP_KERNEL,
3590 .order = order,
3591 .may_unmap = 1,
3592 };
3593
3594 set_task_reclaim_state(current, &sc.reclaim_state);
3595 psi_memstall_enter(&pflags);
3596 __fs_reclaim_acquire();
3597
3598 count_vm_event(PAGEOUTRUN);
3599
3600
3601
3602
3603
3604
3605 nr_boost_reclaim = 0;
3606 for (i = 0; i <= classzone_idx; i++) {
3607 zone = pgdat->node_zones + i;
3608 if (!managed_zone(zone))
3609 continue;
3610
3611 nr_boost_reclaim += zone->watermark_boost;
3612 zone_boosts[i] = zone->watermark_boost;
3613 }
3614 boosted = nr_boost_reclaim;
3615
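/*
 * Balancing restarts from DEF_PRIORITY here whenever boost reclaim is
 * abandoned because the node turned out to be genuinely imbalanced.
 */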
3616 restart:
3617 sc.priority = DEF_PRIORITY;
3618 do {
3619 unsigned long nr_reclaimed = sc.nr_reclaimed;
3620 bool raise_priority = true;
3621 bool balanced;
3622 bool ret;
3623
3624 sc.reclaim_idx = classzone_idx;
3625
3626 /*
3627  * If the number of buffer_heads exceeds the maximum allowed
3628  * then consider reclaiming from all zones. This has a dual
3629  * purpose -- on 64-bit systems it is expected that
3630  * buffer_heads are stripped during active rotation. On 32-bit
3631  * systems, highmem pages can pin lowmem memory and shrinking
3632  * buffers can relieve lowmem pressure. Reclaim may still not
3633  * go ahead if all eligible zones for the original allocation
3634  * request are balanced to avoid excessive reclaim from kswapd.
3635  */
3636 if (buffer_heads_over_limit) {
3637 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3638 zone = pgdat->node_zones + i;
3639 if (!managed_zone(zone))
3640 continue;
3641
3642 sc.reclaim_idx = i;
3643 break;
3644 }
3645 }
3646
3647
3648
3649
3650
3651
3652
3653
3654 balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
3655 if (!balanced && nr_boost_reclaim) {
3656 nr_boost_reclaim = 0;
3657 goto restart;
3658 }
3659
3660
3661
3662
3663
3664
3665 if (!nr_boost_reclaim && balanced)
3666 goto out;
3667
3668
3669 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
3670 raise_priority = false;
3671
3672 /*
3673  * Do not writeback or swap pages for boosted reclaim. The
3674  * intent is to relieve pressure not issue sub-optimal IO
3675  * from reclaim context. If no pages are reclaimed, the
3676  * reclaim will be aborted.
3677  */
3678 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
3679 sc.may_swap = !nr_boost_reclaim;
3680
3681 /*
3682  * Do some background aging of the anon list, to give
3683  * pages a chance to be referenced before reclaiming. All
3684  * pages are rotated regardless of classzone as this is
3685  * about consistent aging.
3686  */
3687 age_active_anon(pgdat, &sc);
3688
3689
3690
3691
3692
3693 if (sc.priority < DEF_PRIORITY - 2)
3694 sc.may_writepage = 1;
3695
3696
3697 sc.nr_scanned = 0;
3698 nr_soft_scanned = 0;
3699 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3700 sc.gfp_mask, &nr_soft_scanned);
3701 sc.nr_reclaimed += nr_soft_reclaimed;
3702
3703 /*
3704  * There should be no need to raise the scanning priority if
3705  * enough pages are already being scanned that the high
3706  * watermark would be met at 100% efficiency.
3707  */
3708 if (kswapd_shrink_node(pgdat, &sc))
3709 raise_priority = false;
3710
3711 /*
3712  * If enough free pages are available again that allow_direct_reclaim()
3713  * passes, there is no need to keep direct reclaimers throttled on
3714  * pfmemalloc_wait. Wake them.
3715  */
3716 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3717 allow_direct_reclaim(pgdat))
3718 wake_up_all(&pgdat->pfmemalloc_wait);
3719
3720
3721 __fs_reclaim_release();
3722 ret = try_to_freeze();
3723 __fs_reclaim_acquire();
3724 if (ret || kthread_should_stop())
3725 break;
3726
3727
3728
3729
3730
3731 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3732 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
3733
3734 /*
3735  * If reclaim made no progress for a boost, stop reclaim as
3736  * IO cannot be queued and it could be an infinite loop in
3737  * extreme circumstances.
3738  */
3739 if (nr_boost_reclaim && !nr_reclaimed)
3740 break;
3741
3742 if (raise_priority || !nr_reclaimed)
3743 sc.priority--;
3744 } while (sc.priority >= 1);
3745
3746 if (!sc.nr_reclaimed)
3747 pgdat->kswapd_failures++;
3748
3749 out:
3750
3751 if (boosted) {
3752 unsigned long flags;
3753
3754 for (i = 0; i <= classzone_idx; i++) {
3755 if (!zone_boosts[i])
3756 continue;
3757
3758
3759 zone = pgdat->node_zones + i;
3760 spin_lock_irqsave(&zone->lock, flags);
3761 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
3762 spin_unlock_irqrestore(&zone->lock, flags);
3763 }
3764
3765
3766
3767
3768
3769 wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
3770 }
3771
3772 snapshot_refaults(NULL, pgdat);
3773 __fs_reclaim_release();
3774 psi_memstall_leave(&pflags);
3775 set_task_reclaim_state(current, NULL);
3776
3777
3778
3779
3780
3781
3782
3783 return sc.order;
3784 }
3785
3786 /*
3787  * pgdat->kswapd_classzone_idx records the highest zone index that a
3788  * recent wakeup asked kswapd to reclaim for. MAX_NR_ZONES is used as
3789  * a sentinel meaning "no request recorded"; in that case fall back to
3790  * the classzone index kswapd was previously reclaiming for so a spurious
3791  * wakeup does not widen the reclaim scope.
3792  */
3793 static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
3794 enum zone_type prev_classzone_idx)
3795 {
3796 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3797 return prev_classzone_idx;
3798 return pgdat->kswapd_classzone_idx;
3799 }
3800
3801 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3802 unsigned int classzone_idx)
3803 {
3804 long remaining = 0;
3805 DEFINE_WAIT(wait);
3806
3807 if (freezing(current) || kthread_should_stop())
3808 return;
3809
3810 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3811
3812 /*
3813  * Try to sleep for a short interval. Note that kcompactd will only be
3814  * woken if it is possible to sleep for a short interval. This is
3815  * deliberate on the assumption that if reclaim cannot keep an
3816  * eligible zone balanced that it's also unlikely that compaction will
3817  * succeed.
3818  */
3819 if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3820 /*
3821  * Compaction records what page blocks it recently failed to
3822  * isolate pages from and skips them in the future scanning.
3823  * When kswapd is going to sleep, it is reasonable to assume
3824  * that pages and compaction may succeed so reset the cache.
3825  */
3826 reset_isolation_suitable(pgdat);
3827
3828 /*
3829  * We have freed the memory, now we should compact it to make
3830  * allocation of the requested order possible.
3831  */
3832 wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
3833
3834 remaining = schedule_timeout(HZ/10);
3835
3836 /*
3837  * If woken prematurely then reset kswapd_classzone_idx and
3838  * order. The values will either be from a wakeup request or
3839  * the previous request that slept prematurely.
3840  */
3841 if (remaining) {
3842 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3843 pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
3844 }
3845
3846 finish_wait(&pgdat->kswapd_wait, &wait);
3847 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3848 }
3849
3850 /*
3851  * After a short sleep, check if it was a premature sleep. If not, then
3852  * go fully to sleep until explicitly woken up.
3853  */
3854 if (!remaining &&
3855 prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3856 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3857
3858 /*
3859  * vmstat counters are not perfectly accurate and the estimated
3860  * value for counters such as NR_FREE_PAGES can deviate from the
3861  * true value by nr_online_cpus * threshold. To avoid the zone
3862  * watermarks being breached while under pressure, we reduce the
3863  * per-cpu vmstat threshold while kswapd is awake and restore
3864  * them before going back to sleep.
3865  */
3866 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3867
3868 if (!kthread_should_stop())
3869 schedule();
3870
3871 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3872 } else {
3873 if (remaining)
3874 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3875 else
3876 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3877 }
3878 finish_wait(&pgdat->kswapd_wait, &wait);
3879 }
3880
3881 /*
3882  * The background pageout daemon, started as a kernel thread
3883  * from the init process.
3884  *
3885  * This basically trickles out pages so that we have _some_
3886  * free memory available even if there is no other activity
3887  * that frees anything up. This is needed for things like routing
3888  * etc, where we otherwise might have a really huge number of
3889  * dirty pages exceeding the writeback threshold.
3890  *
3891  * If there are applications that are active memory-allocators
3892  * (most normal use), this basically shouldn't matter.
3893  */
3894 static int kswapd(void *p)
3895 {
3896 unsigned int alloc_order, reclaim_order;
3897 unsigned int classzone_idx = MAX_NR_ZONES - 1;
3898 pg_data_t *pgdat = (pg_data_t*)p;
3899 struct task_struct *tsk = current;
3900 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3901
3902 if (!cpumask_empty(cpumask))
3903 set_cpus_allowed_ptr(tsk, cpumask);
3904
3905 /*
3906  * Tell the memory management that we're a "memory allocator",
3907  * and that if we need more memory we should get access to it
3908  * regardless (see "__alloc_pages()"). "kswapd" should
3909  * never get caught in the normal page freeing logic.
3910  *
3911  * (Kswapd normally doesn't need memory anyway, but sometimes
3912  * you need a small amount of memory in order to be able to
3913  * page out something else, and this flag essentially protects
3914  * us from recursively trying to free more memory as we're
3915  * trying to free the first piece of memory in the first place).
3916  */
3917 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3918 set_freezable();
3919
3920 pgdat->kswapd_order = 0;
3921 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
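/*
 * Main kswapd loop: sleep until woken by an allocation that breaches the
 * watermarks, then balance the node for the requested order and highest
 * usable zone index.
 */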
3922 for ( ; ; ) {
3923 bool ret;
3924
3925 alloc_order = reclaim_order = pgdat->kswapd_order;
3926 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3927
3928 kswapd_try_sleep:
3929 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3930 classzone_idx);
3931
3932
3933 alloc_order = reclaim_order = pgdat->kswapd_order;
3934 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3935 pgdat->kswapd_order = 0;
3936 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3937
3938 ret = try_to_freeze();
3939 if (kthread_should_stop())
3940 break;
3941
3942
3943
3944
3945
3946 if (ret)
3947 continue;
3948
3949 /*
3950  * Reclaim begins at the requested order but if a high-order
3951  * reclaim fails then kswapd falls back to reclaiming for
3952  * order-0. If that happens, kswapd will consider sleeping
3953  * for the order it finished reclaiming at (reclaim_order)
3954  * but kcompactd is woken to compact for the original
3955  * request (alloc_order).
3956  */
3957 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
3958 alloc_order);
3959 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
3960 if (reclaim_order < alloc_order)
3961 goto kswapd_try_sleep;
3962 }
3963
3964 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3965
3966 return 0;
3967 }
3968
3969 /*
3970  * A zone is low on free memory or too fragmented for high-order memory.  If
3971  * kswapd should reclaim (direct reclaim is deferred), wake it up for the
3972  * zone's pgdat.  It will wake up kcompactd after reclaiming memory.  If
3973  * kswapd reclaim has failed or is not needed, still wake up kcompactd if
3974  * only compaction is needed.
3975  */
3976 void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3977 enum zone_type classzone_idx)
3978 {
3979 pg_data_t *pgdat;
3980
3981 if (!managed_zone(zone))
3982 return;
3983
3984 if (!cpuset_zone_allowed(zone, gfp_flags))
3985 return;
3986 pgdat = zone->zone_pgdat;
3987
3988 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3989 pgdat->kswapd_classzone_idx = classzone_idx;
3990 else
3991 pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
3992 classzone_idx);
3993 pgdat->kswapd_order = max(pgdat->kswapd_order, order);
3994 if (!waitqueue_active(&pgdat->kswapd_wait))
3995 return;
3996
3997 /* Hopeless node, leave it to direct reclaim if possible */
3998 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3999 (pgdat_balanced(pgdat, order, classzone_idx) &&
4000 !pgdat_watermark_boosted(pgdat, classzone_idx))) {
4001 /*
4002  * There may be plenty of free memory available, but it's too
4003  * fragmented for high-order allocations.  Wake up kcompactd
4004  * and rely on compaction_suitable() to determine if it's
4005  * needed.  If it fails, it will defer subsequent attempts to
4006  * ratelimit its work.
4007  */
4008 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
4009 wakeup_kcompactd(pgdat, order, classzone_idx);
4010 return;
4011 }
4012
4013 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
4014 gfp_flags);
4015 wake_up_interruptible(&pgdat->kswapd_wait);
4016 }
4017
4018 #ifdef CONFIG_HIBERNATION
4019 /*
4020  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number
4021  * of freed pages.
4022  *
4023  * Rather than trying to age LRUs the aim is to preserve the overall
4024  * LRU order by reclaiming preferentially
4025  * inactive > active > active referenced > active mapped
4026  */
4027 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
4028 {
4029 struct scan_control sc = {
4030 .nr_to_reclaim = nr_to_reclaim,
4031 .gfp_mask = GFP_HIGHUSER_MOVABLE,
4032 .reclaim_idx = MAX_NR_ZONES - 1,
4033 .priority = DEF_PRIORITY,
4034 .may_writepage = 1,
4035 .may_unmap = 1,
4036 .may_swap = 1,
4037 .hibernation_mode = 1,
4038 };
4039 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
4040 unsigned long nr_reclaimed;
4041 unsigned int noreclaim_flag;
4042
4043 fs_reclaim_acquire(sc.gfp_mask);
4044 noreclaim_flag = memalloc_noreclaim_save();
4045 set_task_reclaim_state(current, &sc.reclaim_state);
4046
4047 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4048
4049 set_task_reclaim_state(current, NULL);
4050 memalloc_noreclaim_restore(noreclaim_flag);
4051 fs_reclaim_release(sc.gfp_mask);
4052
4053 return nr_reclaimed;
4054 }
4055 #endif
4056
4057
4058
4059
4060
4061 static int kswapd_cpu_online(unsigned int cpu)
4062 {
4063 int nid;
4064
4065 for_each_node_state(nid, N_MEMORY) {
4066 pg_data_t *pgdat = NODE_DATA(nid);
4067 const struct cpumask *mask;
4068
4069 mask = cpumask_of_node(pgdat->node_id);
4070
4071 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
4072
4073 set_cpus_allowed_ptr(pgdat->kswapd, mask);
4074 }
4075 return 0;
4076 }
4077
4078
4079
4080
4081
4082 int kswapd_run(int nid)
4083 {
4084 pg_data_t *pgdat = NODE_DATA(nid);
4085 int ret = 0;
4086
4087 if (pgdat->kswapd)
4088 return 0;
4089
4090 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
4091 if (IS_ERR(pgdat->kswapd)) {
4092
4093 BUG_ON(system_state < SYSTEM_RUNNING);
4094 pr_err("Failed to start kswapd on node %d\n", nid);
4095 ret = PTR_ERR(pgdat->kswapd);
4096 pgdat->kswapd = NULL;
4097 }
4098 return ret;
4099 }
4100
4101
4102
4103
4104
4105 void kswapd_stop(int nid)
4106 {
4107 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
4108
4109 if (kswapd) {
4110 kthread_stop(kswapd);
4111 NODE_DATA(nid)->kswapd = NULL;
4112 }
4113 }
4114
4115 static int __init kswapd_init(void)
4116 {
4117 int nid, ret;
4118
4119 swap_setup();
4120 for_each_node_state(nid, N_MEMORY)
4121 kswapd_run(nid);
4122 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
4123 "mm/vmscan:online", kswapd_cpu_online,
4124 NULL);
4125 WARN_ON(ret < 0);
4126 return 0;
4127 }
4128
4129 module_init(kswapd_init)
4130
4131 #ifdef CONFIG_NUMA
4132 /*
4133  * Node reclaim mode
4134  *
4135  * If non-zero call node_reclaim when the number of free pages falls below
4136  * the watermarks.
4137  */
4138 int node_reclaim_mode __read_mostly;
4139
4140 #define RECLAIM_OFF 0
4141 #define RECLAIM_ZONE (1<<0)
4142 #define RECLAIM_WRITE (1<<1)
4143 #define RECLAIM_UNMAP (1<<2)
4144
4145
4146
4147
4148
4149
4150 #define NODE_RECLAIM_PRIORITY 4
4151
4152 /*
4153  * Percentage of pages in a zone that must be unmapped for node_reclaim to
4154  * occur.
4155  */
4156 int sysctl_min_unmapped_ratio = 1;
4157
4158 /*
4159  * If the number of slab pages in a zone grows beyond this percentage then
4160  * slab reclaim needs to occur.
4161  */
4162 int sysctl_min_slab_ratio = 5;
4163
4164 static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4165 {
4166 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4167 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4168 node_page_state(pgdat, NR_ACTIVE_FILE);
4169
4170
4171
4172
4173
4174
4175 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4176 }
4177
4178 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
4179 static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4180 {
4181 unsigned long nr_pagecache_reclaimable;
4182 unsigned long delta = 0;
4183
4184 /*
4185  * If RECLAIM_UNMAP is set, then all file pages are considered
4186  * potentially reclaimable. Otherwise, we have to worry about
4187  * pages like swapcache and node_unmapped_file_pages() provides
4188  * a better estimate
4189  */
4190 if (node_reclaim_mode & RECLAIM_UNMAP)
4191 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4192 else
4193 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4194
4195
4196 if (!(node_reclaim_mode & RECLAIM_WRITE))
4197 delta += node_page_state(pgdat, NR_FILE_DIRTY);
4198
4199
4200 if (unlikely(delta > nr_pagecache_reclaimable))
4201 delta = nr_pagecache_reclaimable;
4202
4203 return nr_pagecache_reclaimable - delta;
4204 }
4205
4206 /*
4207  * Try to free up some pages from this node through reclaim.
4208  */
4209 static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4210 {
4211 /* Minimum pages needed to be freed for reclaim */
4212 const unsigned long nr_pages = 1 << order;
4213 struct task_struct *p = current;
4214 unsigned int noreclaim_flag;
4215 struct scan_control sc = {
4216 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4217 .gfp_mask = current_gfp_context(gfp_mask),
4218 .order = order,
4219 .priority = NODE_RECLAIM_PRIORITY,
4220 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4221 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4222 .may_swap = 1,
4223 .reclaim_idx = gfp_zone(gfp_mask),
4224 };
4225
4226 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
4227 sc.gfp_mask);
4228
4229 cond_resched();
4230 fs_reclaim_acquire(sc.gfp_mask);
4231
4232 /*
4233  * We need to be able to allocate from the reserves for RECLAIM_UNMAP and we
4234  * also need to be able to write out pages for RECLAIM_WRITE and RECLAIM_UNMAP.
4235  */
4236 noreclaim_flag = memalloc_noreclaim_save();
4237 p->flags |= PF_SWAPWRITE;
4238 set_task_reclaim_state(p, &sc.reclaim_state);
4239
4240 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4241 /*
4242  * Free memory by calling shrink node with increasing
4243  * priorities until we have enough memory freed.
4244  */
4245 do {
4246 shrink_node(pgdat, &sc);
4247 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4248 }
4249
4250 set_task_reclaim_state(p, NULL);
4251 current->flags &= ~PF_SWAPWRITE;
4252 memalloc_noreclaim_restore(noreclaim_flag);
4253 fs_reclaim_release(sc.gfp_mask);
4254
4255 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
4256
4257 return sc.nr_reclaimed >= nr_pages;
4258 }
4259
4260 int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4261 {
4262 int ret;
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4275 node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
4276 return NODE_RECLAIM_FULL;
4277
4278
4279
4280
4281 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4282 return NODE_RECLAIM_NOSCAN;
4283
4284 /*
4285  * Only run node reclaim on the local node or on nodes that do not
4286  * have associated processors. This will favor the local processor
4287  * over remote processors and spread off node memory allocations
4288  * as wide as possible.
4289  */
4290 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4291 return NODE_RECLAIM_NOSCAN;
4292
4293 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4294 return NODE_RECLAIM_NOSCAN;
4295
4296 ret = __node_reclaim(pgdat, gfp_mask, order);
4297 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4298
4299 if (!ret)
4300 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4301
4302 return ret;
4303 }
4304 #endif
4305
4306 /**
4307  * page_evictable - test whether a page is evictable
4308  * @page: the page to test
4309  *
4310  * Test whether page is evictable--i.e., should be placed on active/inactive
4311  * lists vs unevictable list.
4312  *
4313  * Reasons page might not be evictable:
4314  * (1) page's mapping marked unevictable
4315  * (2) page is part of an mlocked VMA
4316  *
4317  */
4318 int page_evictable(struct page *page)
4319 {
4320 int ret;
4321
4322
4323 rcu_read_lock();
4324 ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
4325 rcu_read_unlock();
4326 return ret;
4327 }
4328
4329 /**
4330  * check_move_unevictable_pages - check pages for evictability and move to
4331  * appropriate zone lru list
4332  * @pvec: pagevec with lru pages to check
4333  *
4334  * Checks pages for evictability and, if an unevictable page has become
4335  * evictable again, moves it back to the appropriate lru list. The node
4336  * lru_lock is taken internally, so the caller must not already hold it.
4337  */
4338 void check_move_unevictable_pages(struct pagevec *pvec)
4339 {
4340 struct lruvec *lruvec;
4341 struct pglist_data *pgdat = NULL;
4342 int pgscanned = 0;
4343 int pgrescued = 0;
4344 int i;
4345
4346 for (i = 0; i < pvec->nr; i++) {
4347 struct page *page = pvec->pages[i];
4348 struct pglist_data *pagepgdat = page_pgdat(page);
4349
4350 pgscanned++;
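/*
 * Batch the node lru_lock: it is only dropped and re-taken when the
 * current page belongs to a different node than the previous one.
 */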
4351 if (pagepgdat != pgdat) {
4352 if (pgdat)
4353 spin_unlock_irq(&pgdat->lru_lock);
4354 pgdat = pagepgdat;
4355 spin_lock_irq(&pgdat->lru_lock);
4356 }
4357 lruvec = mem_cgroup_page_lruvec(page, pgdat);
4358
4359 if (!PageLRU(page) || !PageUnevictable(page))
4360 continue;
4361
4362 if (page_evictable(page)) {
4363 enum lru_list lru = page_lru_base_type(page);
4364
4365 VM_BUG_ON_PAGE(PageActive(page), page);
4366 ClearPageUnevictable(page);
4367 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
4368 add_page_to_lru_list(page, lruvec, lru);
4369 pgrescued++;
4370 }
4371 }
4372
4373 if (pgdat) {
4374 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4375 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4376 spin_unlock_irq(&pgdat->lru_lock);
4377 }
4378 }
4379 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);