This source file includes the following definitions:
- swap_type_to_swap_info
- swap_count
- __try_to_reclaim_swap
- first_se
- next_se
- discard_swap
- offset_to_swap_extent
- discard_swap_cluster
- cluster_set_flag
- cluster_count
- cluster_set_count
- cluster_set_count_flag
- cluster_next
- cluster_set_next
- cluster_set_next_flag
- cluster_is_free
- cluster_is_null
- cluster_set_null
- cluster_is_huge
- cluster_clear_huge
- lock_cluster
- unlock_cluster
- lock_cluster_or_swap_info
- unlock_cluster_or_swap_info
- cluster_list_empty
- cluster_list_first
- cluster_list_init
- cluster_list_add_tail
- cluster_list_del_first
- swap_cluster_schedule_discard
- __free_cluster
- swap_do_scheduled_discard
- swap_discard_work
- alloc_cluster
- free_cluster
- inc_cluster_info_page
- dec_cluster_info_page
- scan_swap_map_ssd_cluster_conflict
- scan_swap_map_try_ssd_cluster
- __del_from_avail_list
- del_from_avail_list
- swap_range_alloc
- add_to_avail_list
- swap_range_free
- scan_swap_map_slots
- swap_alloc_cluster
- swap_free_cluster
- scan_swap_map
- get_swap_pages
- get_swap_page_of_type
- __swap_info_get
- _swap_info_get
- swap_info_get
- swap_info_get_cont
- __swap_entry_free_locked
- get_swap_device
- __swap_entry_free
- swap_entry_free
- swap_free
- put_swap_page
- split_swap_cluster
- swp_entry_cmp
- swapcache_free_entries
- page_swapcount
- __swap_count
- swap_swapcount
- __swp_swapcount
- swp_swapcount
- swap_page_trans_huge_swapped
- page_swapped
- page_trans_huge_map_swapcount
- reuse_swap_page
- try_to_free_swap
- free_swap_and_cache
- swap_type_of
- swapdev_block
- count_swap_pages
- pte_same_as_swp
- unuse_pte
- unuse_pte_range
- unuse_pmd_range
- unuse_pud_range
- unuse_p4d_range
- unuse_vma
- unuse_mm
- find_next_to_unuse
- try_to_unuse
- drain_mmlist
- map_swap_entry
- map_swap_page
- destroy_swap_extents
- add_swap_extent
- setup_swap_extents
- swap_node
- setup_swap_info
- _enable_swap_info
- enable_swap_info
- reinsert_swap_info
- has_usable_swap
- SYSCALL_DEFINE1
- swaps_poll
- swap_start
- swap_next
- swap_stop
- swap_show
- swaps_open
- procswaps_init
- max_swapfiles_check
- alloc_swap_info
- claim_swapfile
- generic_max_swapfile_size
- max_swapfile_size
- read_swap_header
- setup_swap_map_and_extents
- swap_discardable
- SYSCALL_DEFINE2
- si_swapinfo
- __swap_duplicate
- swap_shmem_alloc
- swap_duplicate
- swapcache_prepare
- swp_swap_info
- page_swap_info
- __page_file_mapping
- __page_file_index
- add_swap_count_continuation
- swap_count_continued
- free_swap_count_continuations
- mem_cgroup_throttle_swaprate
- swapfile_init
1
2
3
4
5
6
7
8
9 #include <linux/mm.h>
10 #include <linux/sched/mm.h>
11 #include <linux/sched/task.h>
12 #include <linux/hugetlb.h>
13 #include <linux/mman.h>
14 #include <linux/slab.h>
15 #include <linux/kernel_stat.h>
16 #include <linux/swap.h>
17 #include <linux/vmalloc.h>
18 #include <linux/pagemap.h>
19 #include <linux/namei.h>
20 #include <linux/shmem_fs.h>
21 #include <linux/blkdev.h>
22 #include <linux/random.h>
23 #include <linux/writeback.h>
24 #include <linux/proc_fs.h>
25 #include <linux/seq_file.h>
26 #include <linux/init.h>
27 #include <linux/ksm.h>
28 #include <linux/rmap.h>
29 #include <linux/security.h>
30 #include <linux/backing-dev.h>
31 #include <linux/mutex.h>
32 #include <linux/capability.h>
33 #include <linux/syscalls.h>
34 #include <linux/memcontrol.h>
35 #include <linux/poll.h>
36 #include <linux/oom.h>
37 #include <linux/frontswap.h>
38 #include <linux/swapfile.h>
39 #include <linux/export.h>
40 #include <linux/swap_slots.h>
41 #include <linux/sort.h>
42
43 #include <asm/pgtable.h>
44 #include <asm/tlbflush.h>
45 #include <linux/swapops.h>
46 #include <linux/swap_cgroup.h>
47
48 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
49 unsigned char);
50 static void free_swap_count_continuations(struct swap_info_struct *);
51 static sector_t map_swap_entry(swp_entry_t, struct block_device**);
52
53 DEFINE_SPINLOCK(swap_lock);
54 static unsigned int nr_swapfiles;
55 atomic_long_t nr_swap_pages;
56
57
58
59
60
61 EXPORT_SYMBOL_GPL(nr_swap_pages);
62
63 long total_swap_pages;
64 static int least_priority = -1;
65
66 static const char Bad_file[] = "Bad swap file entry ";
67 static const char Unused_file[] = "Unused swap file entry ";
68 static const char Bad_offset[] = "Bad swap offset entry ";
69 static const char Unused_offset[] = "Unused swap offset entry ";
70
71
72
73
74
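/*
 * All active swap_info_structs, kept on a plist in priority order and
 * protected by swap_lock (see setup_swap_info()/_enable_swap_info() below).
 */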
75 PLIST_HEAD(swap_active_head);
76
77
78
79
80
81
82
83
84
85
86
87
88
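/*
 * Per-node plists of the swap devices that are active and not yet full,
 * ordered by priority and protected by swap_avail_lock.  get_swap_pages()
 * allocates from the list of the current node; a device is removed in
 * swap_range_alloc() once it fills up and re-added in swap_range_free().
 */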
89 static struct plist_head *swap_avail_heads;
90 static DEFINE_SPINLOCK(swap_avail_lock);
91
92 struct swap_info_struct *swap_info[MAX_SWAPFILES];
93
94 static DEFINE_MUTEX(swapon_mutex);
95
96 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
97
98 static atomic_t proc_poll_event = ATOMIC_INIT(0);
99
100 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
101
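/*
 * Map a swap type to its swap_info_struct, or NULL if the type is not in
 * use.  The READ_ONCE()/smp_rmb() pair orders the read of nr_swapfiles
 * against the read of the swap_info[] slot published by swapon.
 */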
102 static struct swap_info_struct *swap_type_to_swap_info(int type)
103 {
104 if (type >= READ_ONCE(nr_swapfiles))
105 return NULL;
106
107 smp_rmb();
108 return READ_ONCE(swap_info[type]);
109 }
110
111 static inline unsigned char swap_count(unsigned char ent)
112 {
113 return ent & ~SWAP_HAS_CACHE;
114 }
115
116
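/*
 * Flags for __try_to_reclaim_swap():
 *  TTRS_ANYWAY   - reclaim the swap entry whenever possible
 *  TTRS_UNMAPPED - reclaim only if the page is no longer mapped
 *  TTRS_FULL     - reclaim only if swap is getting full
 */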
117 #define TTRS_ANYWAY 0x1
118
119
120
121
122 #define TTRS_UNMAPPED 0x2
123
124 #define TTRS_FULL 0x4
125
126
127 static int __try_to_reclaim_swap(struct swap_info_struct *si,
128 unsigned long offset, unsigned long flags)
129 {
130 swp_entry_t entry = swp_entry(si->type, offset);
131 struct page *page;
132 int ret = 0;
133
134 page = find_get_page(swap_address_space(entry), offset);
135 if (!page)
136 return 0;
137
138
139
140
141
142
143
144 if (trylock_page(page)) {
145 if ((flags & TTRS_ANYWAY) ||
146 ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
147 ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
148 ret = try_to_free_swap(page);
149 unlock_page(page);
150 }
151 put_page(page);
152 return ret;
153 }
154
155 static inline struct swap_extent *first_se(struct swap_info_struct *sis)
156 {
157 struct rb_node *rb = rb_first(&sis->swap_extent_root);
158 return rb_entry(rb, struct swap_extent, rb_node);
159 }
160
161 static inline struct swap_extent *next_se(struct swap_extent *se)
162 {
163 struct rb_node *rb = rb_next(&se->rb_node);
164 return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
165 }
166
167
168
169
170
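/*
 * swapon tells the device that all of the old swap contents can be
 * discarded, to allow the swap device to optimize its wear-levelling.
 * The first extent skips the swap header page.
 */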
171 static int discard_swap(struct swap_info_struct *si)
172 {
173 struct swap_extent *se;
174 sector_t start_block;
175 sector_t nr_blocks;
176 int err = 0;
177
178
179 se = first_se(si);
180 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
181 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
182 if (nr_blocks) {
183 err = blkdev_issue_discard(si->bdev, start_block,
184 nr_blocks, GFP_KERNEL, 0);
185 if (err)
186 return err;
187 cond_resched();
188 }
189
190 for (se = next_se(se); se; se = next_se(se)) {
191 start_block = se->start_block << (PAGE_SHIFT - 9);
192 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
193
194 err = blkdev_issue_discard(si->bdev, start_block,
195 nr_blocks, GFP_KERNEL, 0);
196 if (err)
197 break;
198
199 cond_resched();
200 }
201 return err;
202 }
203
204 static struct swap_extent *
205 offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
206 {
207 struct swap_extent *se;
208 struct rb_node *rb;
209
210 rb = sis->swap_extent_root.rb_node;
211 while (rb) {
212 se = rb_entry(rb, struct swap_extent, rb_node);
213 if (offset < se->start_page)
214 rb = rb->rb_left;
215 else if (offset >= se->start_page + se->nr_pages)
216 rb = rb->rb_right;
217 else
218 return se;
219 }
220
221 BUG();
222 }
223
224
225
226
227
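/*
 * Swap allocation tells the device that a cluster of swap pages can now
 * be discarded.  Unlike discard_swap(), this runs while the device is in
 * active use, from the scheduled-discard work.
 */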
228 static void discard_swap_cluster(struct swap_info_struct *si,
229 pgoff_t start_page, pgoff_t nr_pages)
230 {
231 struct swap_extent *se = offset_to_swap_extent(si, start_page);
232
233 while (nr_pages) {
234 pgoff_t offset = start_page - se->start_page;
235 sector_t start_block = se->start_block + offset;
236 sector_t nr_blocks = se->nr_pages - offset;
237
238 if (nr_blocks > nr_pages)
239 nr_blocks = nr_pages;
240 start_page += nr_blocks;
241 nr_pages -= nr_blocks;
242
243 start_block <<= PAGE_SHIFT - 9;
244 nr_blocks <<= PAGE_SHIFT - 9;
245 if (blkdev_issue_discard(si->bdev, start_block,
246 nr_blocks, GFP_NOIO, 0))
247 break;
248
249 se = next_se(se);
250 }
251 }
252
253 #ifdef CONFIG_THP_SWAP
254 #define SWAPFILE_CLUSTER HPAGE_PMD_NR
255
256 #define swap_entry_size(size) (size)
257 #else
258 #define SWAPFILE_CLUSTER 256
259
260
261
262
263
264 #define swap_entry_size(size) 1
265 #endif
266 #define LATENCY_LIMIT 256
267
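/*
 * The swap_cluster_info accessors below share the same two fields:
 * "flags" carries CLUSTER_FLAG_* state, while "data" holds either the
 * usage count of an allocated cluster (cluster_count()) or the index of
 * the next cluster in a list (cluster_next()), depending on context.
 */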
268 static inline void cluster_set_flag(struct swap_cluster_info *info,
269 unsigned int flag)
270 {
271 info->flags = flag;
272 }
273
274 static inline unsigned int cluster_count(struct swap_cluster_info *info)
275 {
276 return info->data;
277 }
278
279 static inline void cluster_set_count(struct swap_cluster_info *info,
280 unsigned int c)
281 {
282 info->data = c;
283 }
284
285 static inline void cluster_set_count_flag(struct swap_cluster_info *info,
286 unsigned int c, unsigned int f)
287 {
288 info->flags = f;
289 info->data = c;
290 }
291
292 static inline unsigned int cluster_next(struct swap_cluster_info *info)
293 {
294 return info->data;
295 }
296
297 static inline void cluster_set_next(struct swap_cluster_info *info,
298 unsigned int n)
299 {
300 info->data = n;
301 }
302
303 static inline void cluster_set_next_flag(struct swap_cluster_info *info,
304 unsigned int n, unsigned int f)
305 {
306 info->flags = f;
307 info->data = n;
308 }
309
310 static inline bool cluster_is_free(struct swap_cluster_info *info)
311 {
312 return info->flags & CLUSTER_FLAG_FREE;
313 }
314
315 static inline bool cluster_is_null(struct swap_cluster_info *info)
316 {
317 return info->flags & CLUSTER_FLAG_NEXT_NULL;
318 }
319
320 static inline void cluster_set_null(struct swap_cluster_info *info)
321 {
322 info->flags = CLUSTER_FLAG_NEXT_NULL;
323 info->data = 0;
324 }
325
326 static inline bool cluster_is_huge(struct swap_cluster_info *info)
327 {
328 if (IS_ENABLED(CONFIG_THP_SWAP))
329 return info->flags & CLUSTER_FLAG_HUGE;
330 return false;
331 }
332
333 static inline void cluster_clear_huge(struct swap_cluster_info *info)
334 {
335 info->flags &= ~CLUSTER_FLAG_HUGE;
336 }
337
338 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
339 unsigned long offset)
340 {
341 struct swap_cluster_info *ci;
342
343 ci = si->cluster_info;
344 if (ci) {
345 ci += offset / SWAPFILE_CLUSTER;
346 spin_lock(&ci->lock);
347 }
348 return ci;
349 }
350
351 static inline void unlock_cluster(struct swap_cluster_info *ci)
352 {
353 if (ci)
354 spin_unlock(&ci->lock);
355 }
356
357
358
359
360
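/*
 * Determine the locking method in use for this device: take the
 * per-cluster lock when cluster_info exists (SSD-style, fine-grained
 * locking), otherwise fall back to taking si->lock.
 */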
361 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
362 struct swap_info_struct *si, unsigned long offset)
363 {
364 struct swap_cluster_info *ci;
365
366
367 ci = lock_cluster(si, offset);
368
369 if (!ci)
370 spin_lock(&si->lock);
371
372 return ci;
373 }
374
375 static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
376 struct swap_cluster_info *ci)
377 {
378 if (ci)
379 unlock_cluster(ci);
380 else
381 spin_unlock(&si->lock);
382 }
383
384 static inline bool cluster_list_empty(struct swap_cluster_list *list)
385 {
386 return cluster_is_null(&list->head);
387 }
388
389 static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
390 {
391 return cluster_next(&list->head);
392 }
393
394 static void cluster_list_init(struct swap_cluster_list *list)
395 {
396 cluster_set_null(&list->head);
397 cluster_set_null(&list->tail);
398 }
399
400 static void cluster_list_add_tail(struct swap_cluster_list *list,
401 struct swap_cluster_info *ci,
402 unsigned int idx)
403 {
404 if (cluster_list_empty(list)) {
405 cluster_set_next_flag(&list->head, idx, 0);
406 cluster_set_next_flag(&list->tail, idx, 0);
407 } else {
408 struct swap_cluster_info *ci_tail;
409 unsigned int tail = cluster_next(&list->tail);
410
411
412
413
414
415 ci_tail = ci + tail;
416 spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
417 cluster_set_next(ci_tail, idx);
418 spin_unlock(&ci_tail->lock);
419 cluster_set_next_flag(&list->tail, idx, 0);
420 }
421 }
422
423 static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
424 struct swap_cluster_info *ci)
425 {
426 unsigned int idx;
427
428 idx = cluster_next(&list->head);
429 if (cluster_next(&list->tail) == idx) {
430 cluster_set_null(&list->head);
431 cluster_set_null(&list->tail);
432 } else
433 cluster_set_next_flag(&list->head,
434 cluster_next(&ci[idx]), 0);
435
436 return idx;
437 }
438
439
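/*
 * Add a cluster to the discard list and schedule the work that actually
 * issues the discard.  The entries are marked SWAP_MAP_BAD so they cannot
 * be reallocated until the discard has completed.
 */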
440 static void swap_cluster_schedule_discard(struct swap_info_struct *si,
441 unsigned int idx)
442 {
443
444
445
446
447
448
449 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
450 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
451
452 cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
453
454 schedule_work(&si->discard_work);
455 }
456
457 static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
458 {
459 struct swap_cluster_info *ci = si->cluster_info;
460
461 cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
462 cluster_list_add_tail(&si->free_clusters, ci, idx);
463 }
464
465
466
467
468
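/*
 * Issue the scheduled discards.  After a cluster has been discarded, its
 * swap_map entries are cleared and the cluster is returned to the free
 * list.  Caller must hold si->lock.
 */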
469 static void swap_do_scheduled_discard(struct swap_info_struct *si)
470 {
471 struct swap_cluster_info *info, *ci;
472 unsigned int idx;
473
474 info = si->cluster_info;
475
476 while (!cluster_list_empty(&si->discard_clusters)) {
477 idx = cluster_list_del_first(&si->discard_clusters, info);
478 spin_unlock(&si->lock);
479
480 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
481 SWAPFILE_CLUSTER);
482
483 spin_lock(&si->lock);
484 ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
485 __free_cluster(si, idx);
486 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
487 0, SWAPFILE_CLUSTER);
488 unlock_cluster(ci);
489 }
490 }
491
492 static void swap_discard_work(struct work_struct *work)
493 {
494 struct swap_info_struct *si;
495
496 si = container_of(work, struct swap_info_struct, discard_work);
497
498 spin_lock(&si->lock);
499 swap_do_scheduled_discard(si);
500 spin_unlock(&si->lock);
501 }
502
503 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
504 {
505 struct swap_cluster_info *ci = si->cluster_info;
506
507 VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
508 cluster_list_del_first(&si->free_clusters, ci);
509 cluster_set_count_flag(ci + idx, 0, 0);
510 }
511
512 static void free_cluster(struct swap_info_struct *si, unsigned long idx)
513 {
514 struct swap_cluster_info *ci = si->cluster_info + idx;
515
516 VM_BUG_ON(cluster_count(ci) != 0);
517
518
519
520
521
522 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
523 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
524 swap_cluster_schedule_discard(si, idx);
525 return;
526 }
527
528 __free_cluster(si, idx);
529 }
530
531
532
533
534
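/*
 * The cluster corresponding to page_nr will be used.  The cluster is
 * removed from the free list if necessary and its usage count is
 * incremented.
 */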
535 static void inc_cluster_info_page(struct swap_info_struct *p,
536 struct swap_cluster_info *cluster_info, unsigned long page_nr)
537 {
538 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
539
540 if (!cluster_info)
541 return;
542 if (cluster_is_free(&cluster_info[idx]))
543 alloc_cluster(p, idx);
544
545 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
546 cluster_set_count(&cluster_info[idx],
547 cluster_count(&cluster_info[idx]) + 1);
548 }
549
550
551
552
553
554
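/*
 * The cluster corresponding to page_nr loses one usage.  If the usage
 * count drops to zero, the cluster is freed (or scheduled for discard).
 */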
555 static void dec_cluster_info_page(struct swap_info_struct *p,
556 struct swap_cluster_info *cluster_info, unsigned long page_nr)
557 {
558 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
559
560 if (!cluster_info)
561 return;
562
563 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
564 cluster_set_count(&cluster_info[idx],
565 cluster_count(&cluster_info[idx]) - 1);
566
567 if (cluster_count(&cluster_info[idx]) == 0)
568 free_cluster(p, idx);
569 }
570
571
572
573
574
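/*
 * It's possible that scan_swap_map() picked an offset inside a cluster
 * that is still on the free list.  Detect that case and drop the per-cpu
 * cluster so the allocation is retried, avoiding free-list corruption.
 */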
575 static bool
576 scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
577 unsigned long offset)
578 {
579 struct percpu_cluster *percpu_cluster;
580 bool conflict;
581
582 offset /= SWAPFILE_CLUSTER;
583 conflict = !cluster_list_empty(&si->free_clusters) &&
584 offset != cluster_list_first(&si->free_clusters) &&
585 cluster_is_free(&si->cluster_info[offset]);
586
587 if (!conflict)
588 return false;
589
590 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
591 cluster_set_null(&percpu_cluster->index);
592 return true;
593 }
594
595
596
597
598
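/*
 * Try to get a swap entry from the current cpu's swap entry pool (a
 * cluster).  This may also allocate a new cluster for the current cpu.
 * Returns true and sets *offset/*scan_base when a free slot is found.
 */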
599 static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
600 unsigned long *offset, unsigned long *scan_base)
601 {
602 struct percpu_cluster *cluster;
603 struct swap_cluster_info *ci;
604 bool found_free;
605 unsigned long tmp, max;
606
607 new_cluster:
608 cluster = this_cpu_ptr(si->percpu_cluster);
609 if (cluster_is_null(&cluster->index)) {
610 if (!cluster_list_empty(&si->free_clusters)) {
611 cluster->index = si->free_clusters.head;
612 cluster->next = cluster_next(&cluster->index) *
613 SWAPFILE_CLUSTER;
614 } else if (!cluster_list_empty(&si->discard_clusters)) {
615
616
617
618
619 swap_do_scheduled_discard(si);
620 *scan_base = *offset = si->cluster_next;
621 goto new_cluster;
622 } else
623 return false;
624 }
625
626 found_free = false;
627
628
629
630
631
632 tmp = cluster->next;
633 max = min_t(unsigned long, si->max,
634 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
635 if (tmp >= max) {
636 cluster_set_null(&cluster->index);
637 goto new_cluster;
638 }
639 ci = lock_cluster(si, tmp);
640 while (tmp < max) {
641 if (!si->swap_map[tmp]) {
642 found_free = true;
643 break;
644 }
645 tmp++;
646 }
647 unlock_cluster(ci);
648 if (!found_free) {
649 cluster_set_null(&cluster->index);
650 goto new_cluster;
651 }
652 cluster->next = tmp + 1;
653 *offset = tmp;
654 *scan_base = tmp;
655 return found_free;
656 }
657
658 static void __del_from_avail_list(struct swap_info_struct *p)
659 {
660 int nid;
661
662 for_each_node(nid)
663 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
664 }
665
666 static void del_from_avail_list(struct swap_info_struct *p)
667 {
668 spin_lock(&swap_avail_lock);
669 __del_from_avail_list(p);
670 spin_unlock(&swap_avail_lock);
671 }
672
673 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
674 unsigned int nr_entries)
675 {
676 unsigned int end = offset + nr_entries - 1;
677
678 if (offset == si->lowest_bit)
679 si->lowest_bit += nr_entries;
680 if (end == si->highest_bit)
681 si->highest_bit -= nr_entries;
682 si->inuse_pages += nr_entries;
683 if (si->inuse_pages == si->pages) {
684 si->lowest_bit = si->max;
685 si->highest_bit = 0;
686 del_from_avail_list(si);
687 }
688 }
689
690 static void add_to_avail_list(struct swap_info_struct *p)
691 {
692 int nid;
693
694 spin_lock(&swap_avail_lock);
695 for_each_node(nid) {
696 WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
697 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
698 }
699 spin_unlock(&swap_avail_lock);
700 }
701
702 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
703 unsigned int nr_entries)
704 {
705 unsigned long end = offset + nr_entries - 1;
706 void (*swap_slot_free_notify)(struct block_device *, unsigned long);
707
708 if (offset < si->lowest_bit)
709 si->lowest_bit = offset;
710 if (end > si->highest_bit) {
711 bool was_full = !si->highest_bit;
712
713 si->highest_bit = end;
714 if (was_full && (si->flags & SWP_WRITEOK))
715 add_to_avail_list(si);
716 }
717 atomic_long_add(nr_entries, &nr_swap_pages);
718 si->inuse_pages -= nr_entries;
719 if (si->flags & SWP_BLKDEV)
720 swap_slot_free_notify =
721 si->bdev->bd_disk->fops->swap_slot_free_notify;
722 else
723 swap_slot_free_notify = NULL;
724 while (offset <= end) {
725 frontswap_invalidate_page(si->type, offset);
726 if (swap_slot_free_notify)
727 swap_slot_free_notify(si->bdev, offset);
728 offset++;
729 }
730 }
731
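/*
 * Scan the swap map for up to @nr free slots, storing the resulting swap
 * entries in @slots[] and returning how many were allocated.  When
 * cluster_info is present (SSD case) the per-cpu cluster allocator is
 * tried first; otherwise the legacy sequential cluster scan is used.
 * Called with si->lock held.
 */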
732 static int scan_swap_map_slots(struct swap_info_struct *si,
733 unsigned char usage, int nr,
734 swp_entry_t slots[])
735 {
736 struct swap_cluster_info *ci;
737 unsigned long offset;
738 unsigned long scan_base;
739 unsigned long last_in_cluster = 0;
740 int latency_ration = LATENCY_LIMIT;
741 int n_ret = 0;
742
743 if (nr > SWAP_BATCH)
744 nr = SWAP_BATCH;
745
746
747
748
749
750
751
752
753
754
755
756
757 si->flags += SWP_SCANNING;
758 scan_base = offset = si->cluster_next;
759
760
761 if (si->cluster_info) {
762 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
763 goto checks;
764 else
765 goto scan;
766 }
767
768 if (unlikely(!si->cluster_nr--)) {
769 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
770 si->cluster_nr = SWAPFILE_CLUSTER - 1;
771 goto checks;
772 }
773
774 spin_unlock(&si->lock);
775
776
777
778
779
780
781
782 scan_base = offset = si->lowest_bit;
783 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
784
785
786 for (; last_in_cluster <= si->highest_bit; offset++) {
787 if (si->swap_map[offset])
788 last_in_cluster = offset + SWAPFILE_CLUSTER;
789 else if (offset == last_in_cluster) {
790 spin_lock(&si->lock);
791 offset -= SWAPFILE_CLUSTER - 1;
792 si->cluster_next = offset;
793 si->cluster_nr = SWAPFILE_CLUSTER - 1;
794 goto checks;
795 }
796 if (unlikely(--latency_ration < 0)) {
797 cond_resched();
798 latency_ration = LATENCY_LIMIT;
799 }
800 }
801
802 offset = scan_base;
803 spin_lock(&si->lock);
804 si->cluster_nr = SWAPFILE_CLUSTER - 1;
805 }
806
807 checks:
808 if (si->cluster_info) {
809 while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
810
811 if (n_ret)
812 goto done;
813 if (!scan_swap_map_try_ssd_cluster(si, &offset,
814 &scan_base))
815 goto scan;
816 }
817 }
818 if (!(si->flags & SWP_WRITEOK))
819 goto no_page;
820 if (!si->highest_bit)
821 goto no_page;
822 if (offset > si->highest_bit)
823 scan_base = offset = si->lowest_bit;
824
825 ci = lock_cluster(si, offset);
826
827 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
828 int swap_was_freed;
829 unlock_cluster(ci);
830 spin_unlock(&si->lock);
831 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
832 spin_lock(&si->lock);
833
834 if (swap_was_freed)
835 goto checks;
836 goto scan;
837 }
838
839 if (si->swap_map[offset]) {
840 unlock_cluster(ci);
841 if (!n_ret)
842 goto scan;
843 else
844 goto done;
845 }
846 si->swap_map[offset] = usage;
847 inc_cluster_info_page(si, si->cluster_info, offset);
848 unlock_cluster(ci);
849
850 swap_range_alloc(si, offset, 1);
851 si->cluster_next = offset + 1;
852 slots[n_ret++] = swp_entry(si->type, offset);
853
854
855 if ((n_ret == nr) || (offset >= si->highest_bit))
856 goto done;
857
858
859
860
861 if (unlikely(--latency_ration < 0)) {
862 if (n_ret)
863 goto done;
864 spin_unlock(&si->lock);
865 cond_resched();
866 spin_lock(&si->lock);
867 latency_ration = LATENCY_LIMIT;
868 }
869
870
871 if (si->cluster_info) {
872 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
873 goto checks;
874 else
875 goto done;
876 }
877
878 ++offset;
879
880
881 if (si->cluster_nr && !si->swap_map[offset]) {
882 --si->cluster_nr;
883 goto checks;
884 }
885
886 done:
887 si->flags -= SWP_SCANNING;
888 return n_ret;
889
890 scan:
891 spin_unlock(&si->lock);
892 while (++offset <= si->highest_bit) {
893 if (!si->swap_map[offset]) {
894 spin_lock(&si->lock);
895 goto checks;
896 }
897 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
898 spin_lock(&si->lock);
899 goto checks;
900 }
901 if (unlikely(--latency_ration < 0)) {
902 cond_resched();
903 latency_ration = LATENCY_LIMIT;
904 }
905 }
906 offset = si->lowest_bit;
907 while (offset < scan_base) {
908 if (!si->swap_map[offset]) {
909 spin_lock(&si->lock);
910 goto checks;
911 }
912 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
913 spin_lock(&si->lock);
914 goto checks;
915 }
916 if (unlikely(--latency_ration < 0)) {
917 cond_resched();
918 latency_ration = LATENCY_LIMIT;
919 }
920 offset++;
921 }
922 spin_lock(&si->lock);
923
924 no_page:
925 si->flags -= SWP_SCANNING;
926 return n_ret;
927 }
928
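/*
 * Allocate a whole cluster (SWAPFILE_CLUSTER slots) for a THP being added
 * to the swap cache.  Only meaningful when CONFIG_THP_SWAP is enabled.
 */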
929 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
930 {
931 unsigned long idx;
932 struct swap_cluster_info *ci;
933 unsigned long offset, i;
934 unsigned char *map;
935
936
937
938
939
940 if (!IS_ENABLED(CONFIG_THP_SWAP)) {
941 VM_WARN_ON_ONCE(1);
942 return 0;
943 }
944
945 if (cluster_list_empty(&si->free_clusters))
946 return 0;
947
948 idx = cluster_list_first(&si->free_clusters);
949 offset = idx * SWAPFILE_CLUSTER;
950 ci = lock_cluster(si, offset);
951 alloc_cluster(si, idx);
952 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
953
954 map = si->swap_map + offset;
955 for (i = 0; i < SWAPFILE_CLUSTER; i++)
956 map[i] = SWAP_HAS_CACHE;
957 unlock_cluster(ci);
958 swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
959 *slot = swp_entry(si->type, offset);
960
961 return 1;
962 }
963
964 static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
965 {
966 unsigned long offset = idx * SWAPFILE_CLUSTER;
967 struct swap_cluster_info *ci;
968
969 ci = lock_cluster(si, offset);
970 memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
971 cluster_set_count_flag(ci, 0, 0);
972 free_cluster(si, idx);
973 unlock_cluster(ci);
974 swap_range_free(si, offset, SWAPFILE_CLUSTER);
975 }
976
977 static unsigned long scan_swap_map(struct swap_info_struct *si,
978 unsigned char usage)
979 {
980 swp_entry_t entry;
981 int n_ret;
982
983 n_ret = scan_swap_map_slots(si, usage, 1, &entry);
984
985 if (n_ret)
986 return swp_offset(entry);
987 else
988 return 0;
989
990 }
991
992 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
993 {
994 unsigned long size = swap_entry_size(entry_size);
995 struct swap_info_struct *si, *next;
996 long avail_pgs;
997 int n_ret = 0;
998 int node;
999
1000
1001 WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
1002
1003 avail_pgs = atomic_long_read(&nr_swap_pages) / size;
1004 if (avail_pgs <= 0)
1005 goto noswap;
1006
1007 if (n_goal > SWAP_BATCH)
1008 n_goal = SWAP_BATCH;
1009
1010 if (n_goal > avail_pgs)
1011 n_goal = avail_pgs;
1012
1013 atomic_long_sub(n_goal * size, &nr_swap_pages);
1014
1015 spin_lock(&swap_avail_lock);
1016
1017 start_over:
1018 node = numa_node_id();
1019 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
1020
1021 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
1022 spin_unlock(&swap_avail_lock);
1023 spin_lock(&si->lock);
1024 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
1025 spin_lock(&swap_avail_lock);
1026 if (plist_node_empty(&si->avail_lists[node])) {
1027 spin_unlock(&si->lock);
1028 goto nextsi;
1029 }
1030 WARN(!si->highest_bit,
1031 "swap_info %d in list but !highest_bit\n",
1032 si->type);
1033 WARN(!(si->flags & SWP_WRITEOK),
1034 "swap_info %d in list but !SWP_WRITEOK\n",
1035 si->type);
1036 __del_from_avail_list(si);
1037 spin_unlock(&si->lock);
1038 goto nextsi;
1039 }
1040 if (size == SWAPFILE_CLUSTER) {
1041 if (!(si->flags & SWP_FS))
1042 n_ret = swap_alloc_cluster(si, swp_entries);
1043 } else
1044 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
1045 n_goal, swp_entries);
1046 spin_unlock(&si->lock);
1047 if (n_ret || size == SWAPFILE_CLUSTER)
1048 goto check_out;
1049 pr_debug("scan_swap_map of si %d failed to find offset\n",
1050 si->type);
1051
1052 spin_lock(&swap_avail_lock);
1053 nextsi:
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065 if (plist_node_empty(&next->avail_lists[node]))
1066 goto start_over;
1067 }
1068
1069 spin_unlock(&swap_avail_lock);
1070
1071 check_out:
1072 if (n_ret < n_goal)
1073 atomic_long_add((long)(n_goal - n_ret) * size,
1074 &nr_swap_pages);
1075 noswap:
1076 return n_ret;
1077 }
1078
1079
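/* Allocate a single swap slot from the swap device of the given type. */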
1080 swp_entry_t get_swap_page_of_type(int type)
1081 {
1082 struct swap_info_struct *si = swap_type_to_swap_info(type);
1083 pgoff_t offset;
1084
1085 if (!si)
1086 goto fail;
1087
1088 spin_lock(&si->lock);
1089 if (si->flags & SWP_WRITEOK) {
1090 atomic_long_dec(&nr_swap_pages);
1091
1092 offset = scan_swap_map(si, 1);
1093 if (offset) {
1094 spin_unlock(&si->lock);
1095 return swp_entry(type, offset);
1096 }
1097 atomic_long_inc(&nr_swap_pages);
1098 }
1099 spin_unlock(&si->lock);
1100 fail:
1101 return (swp_entry_t) {0};
1102 }
1103
1104 static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
1105 {
1106 struct swap_info_struct *p;
1107 unsigned long offset;
1108
1109 if (!entry.val)
1110 goto out;
1111 p = swp_swap_info(entry);
1112 if (!p)
1113 goto bad_nofile;
1114 if (!(p->flags & SWP_USED))
1115 goto bad_device;
1116 offset = swp_offset(entry);
1117 if (offset >= p->max)
1118 goto bad_offset;
1119 return p;
1120
1121 bad_offset:
1122 pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
1123 goto out;
1124 bad_device:
1125 pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
1126 goto out;
1127 bad_nofile:
1128 pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
1129 out:
1130 return NULL;
1131 }
1132
1133 static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1134 {
1135 struct swap_info_struct *p;
1136
1137 p = __swap_info_get(entry);
1138 if (!p)
1139 goto out;
1140 if (!p->swap_map[swp_offset(entry)])
1141 goto bad_free;
1142 return p;
1143
1144 bad_free:
1145 pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1146 goto out;
1147 out:
1148 return NULL;
1149 }
1150
1151 static struct swap_info_struct *swap_info_get(swp_entry_t entry)
1152 {
1153 struct swap_info_struct *p;
1154
1155 p = _swap_info_get(entry);
1156 if (p)
1157 spin_lock(&p->lock);
1158 return p;
1159 }
1160
1161 static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1162 struct swap_info_struct *q)
1163 {
1164 struct swap_info_struct *p;
1165
1166 p = _swap_info_get(entry);
1167
1168 if (p != q) {
1169 if (q != NULL)
1170 spin_unlock(&q->lock);
1171 if (p != NULL)
1172 spin_lock(&p->lock);
1173 }
1174 return p;
1175 }
1176
1177 static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
1178 unsigned long offset,
1179 unsigned char usage)
1180 {
1181 unsigned char count;
1182 unsigned char has_cache;
1183
1184 count = p->swap_map[offset];
1185
1186 has_cache = count & SWAP_HAS_CACHE;
1187 count &= ~SWAP_HAS_CACHE;
1188
1189 if (usage == SWAP_HAS_CACHE) {
1190 VM_BUG_ON(!has_cache);
1191 has_cache = 0;
1192 } else if (count == SWAP_MAP_SHMEM) {
1193
1194
1195
1196
1197 count = 0;
1198 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1199 if (count == COUNT_CONTINUED) {
1200 if (swap_count_continued(p, offset, count))
1201 count = SWAP_MAP_MAX | COUNT_CONTINUED;
1202 else
1203 count = SWAP_MAP_MAX;
1204 } else
1205 count--;
1206 }
1207
1208 usage = count | has_cache;
1209 p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
1210
1211 return usage;
1212 }
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
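/*
 * Check whether the swap entry is valid in its swap device.  If so,
 * return a pointer to the swap_info_struct with the RCU read lock held,
 * which keeps the entry valid by preventing the device from being
 * swapped off (SWP_VALID is cleared and an RCU grace period elapses
 * before swapoff frees structures such as swap_map).  The caller must
 * call put_swap_device() when done.  Returns NULL for an invalid entry.
 */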
1249 struct swap_info_struct *get_swap_device(swp_entry_t entry)
1250 {
1251 struct swap_info_struct *si;
1252 unsigned long offset;
1253
1254 if (!entry.val)
1255 goto out;
1256 si = swp_swap_info(entry);
1257 if (!si)
1258 goto bad_nofile;
1259
1260 rcu_read_lock();
1261 if (!(si->flags & SWP_VALID))
1262 goto unlock_out;
1263 offset = swp_offset(entry);
1264 if (offset >= si->max)
1265 goto unlock_out;
1266
1267 return si;
1268 bad_nofile:
1269 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1270 out:
1271 return NULL;
1272 unlock_out:
1273 rcu_read_unlock();
1274 return NULL;
1275 }
1276
1277 static unsigned char __swap_entry_free(struct swap_info_struct *p,
1278 swp_entry_t entry, unsigned char usage)
1279 {
1280 struct swap_cluster_info *ci;
1281 unsigned long offset = swp_offset(entry);
1282
1283 ci = lock_cluster_or_swap_info(p, offset);
1284 usage = __swap_entry_free_locked(p, offset, usage);
1285 unlock_cluster_or_swap_info(p, ci);
1286 if (!usage)
1287 free_swap_slot(entry);
1288
1289 return usage;
1290 }
1291
1292 static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
1293 {
1294 struct swap_cluster_info *ci;
1295 unsigned long offset = swp_offset(entry);
1296 unsigned char count;
1297
1298 ci = lock_cluster(p, offset);
1299 count = p->swap_map[offset];
1300 VM_BUG_ON(count != SWAP_HAS_CACHE);
1301 p->swap_map[offset] = 0;
1302 dec_cluster_info_page(p, p->cluster_info, offset);
1303 unlock_cluster(ci);
1304
1305 mem_cgroup_uncharge_swap(entry, 1);
1306 swap_range_free(p, offset, 1);
1307 }
1308
1309
1310
1311
1312
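/*
 * Drop one user reference on a swap entry.  The caller has made sure the
 * swap device corresponding to @entry is still around and not recycled.
 */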
1313 void swap_free(swp_entry_t entry)
1314 {
1315 struct swap_info_struct *p;
1316
1317 p = _swap_info_get(entry);
1318 if (p)
1319 __swap_entry_free(p, entry, 1);
1320 }
1321
1322
1323
1324
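/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 * For a THP-sized @page the whole cluster is freed if no entry is still
 * in use; otherwise each entry drops its SWAP_HAS_CACHE reference.
 */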
1325 void put_swap_page(struct page *page, swp_entry_t entry)
1326 {
1327 unsigned long offset = swp_offset(entry);
1328 unsigned long idx = offset / SWAPFILE_CLUSTER;
1329 struct swap_cluster_info *ci;
1330 struct swap_info_struct *si;
1331 unsigned char *map;
1332 unsigned int i, free_entries = 0;
1333 unsigned char val;
1334 int size = swap_entry_size(hpage_nr_pages(page));
1335
1336 si = _swap_info_get(entry);
1337 if (!si)
1338 return;
1339
1340 ci = lock_cluster_or_swap_info(si, offset);
1341 if (size == SWAPFILE_CLUSTER) {
1342 VM_BUG_ON(!cluster_is_huge(ci));
1343 map = si->swap_map + offset;
1344 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1345 val = map[i];
1346 VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1347 if (val == SWAP_HAS_CACHE)
1348 free_entries++;
1349 }
1350 cluster_clear_huge(ci);
1351 if (free_entries == SWAPFILE_CLUSTER) {
1352 unlock_cluster_or_swap_info(si, ci);
1353 spin_lock(&si->lock);
1354 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1355 swap_free_cluster(si, idx);
1356 spin_unlock(&si->lock);
1357 return;
1358 }
1359 }
1360 for (i = 0; i < size; i++, entry.val++) {
1361 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
1362 unlock_cluster_or_swap_info(si, ci);
1363 free_swap_slot(entry);
1364 if (i == size - 1)
1365 return;
1366 lock_cluster_or_swap_info(si, offset);
1367 }
1368 }
1369 unlock_cluster_or_swap_info(si, ci);
1370 }
1371
1372 #ifdef CONFIG_THP_SWAP
1373 int split_swap_cluster(swp_entry_t entry)
1374 {
1375 struct swap_info_struct *si;
1376 struct swap_cluster_info *ci;
1377 unsigned long offset = swp_offset(entry);
1378
1379 si = _swap_info_get(entry);
1380 if (!si)
1381 return -EBUSY;
1382 ci = lock_cluster(si, offset);
1383 cluster_clear_huge(ci);
1384 unlock_cluster(ci);
1385 return 0;
1386 }
1387 #endif
1388
1389 static int swp_entry_cmp(const void *ent1, const void *ent2)
1390 {
1391 const swp_entry_t *e1 = ent1, *e2 = ent2;
1392
1393 return (int)swp_type(*e1) - (int)swp_type(*e2);
1394 }
1395
1396 void swapcache_free_entries(swp_entry_t *entries, int n)
1397 {
1398 struct swap_info_struct *p, *prev;
1399 int i;
1400
1401 if (n <= 0)
1402 return;
1403
1404 prev = NULL;
1405 p = NULL;
1406
1407
1408
1409
1410
1411
1412 if (nr_swapfiles > 1)
1413 sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
1414 for (i = 0; i < n; ++i) {
1415 p = swap_info_get_cont(entries[i], prev);
1416 if (p)
1417 swap_entry_free(p, entries[i]);
1418 prev = p;
1419 }
1420 if (p)
1421 spin_unlock(&p->lock);
1422 }
1423
1424
1425
1426
1427
1428
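/*
 * How many references to @page are currently swapped out?
 * This does not give an exact answer when the swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */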
1429 int page_swapcount(struct page *page)
1430 {
1431 int count = 0;
1432 struct swap_info_struct *p;
1433 struct swap_cluster_info *ci;
1434 swp_entry_t entry;
1435 unsigned long offset;
1436
1437 entry.val = page_private(page);
1438 p = _swap_info_get(entry);
1439 if (p) {
1440 offset = swp_offset(entry);
1441 ci = lock_cluster_or_swap_info(p, offset);
1442 count = swap_count(p->swap_map[offset]);
1443 unlock_cluster_or_swap_info(p, ci);
1444 }
1445 return count;
1446 }
1447
1448 int __swap_count(swp_entry_t entry)
1449 {
1450 struct swap_info_struct *si;
1451 pgoff_t offset = swp_offset(entry);
1452 int count = 0;
1453
1454 si = get_swap_device(entry);
1455 if (si) {
1456 count = swap_count(si->swap_map[offset]);
1457 put_swap_device(si);
1458 }
1459 return count;
1460 }
1461
1462 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1463 {
1464 int count = 0;
1465 pgoff_t offset = swp_offset(entry);
1466 struct swap_cluster_info *ci;
1467
1468 ci = lock_cluster_or_swap_info(si, offset);
1469 count = swap_count(si->swap_map[offset]);
1470 unlock_cluster_or_swap_info(si, ci);
1471 return count;
1472 }
1473
1474
1475
1476
1477
1478
1479 int __swp_swapcount(swp_entry_t entry)
1480 {
1481 int count = 0;
1482 struct swap_info_struct *si;
1483
1484 si = get_swap_device(entry);
1485 if (si) {
1486 count = swap_swapcount(si, entry);
1487 put_swap_device(si);
1488 }
1489 return count;
1490 }
1491
1492
1493
1494
1495
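/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns an exact answer.
 */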
1496 int swp_swapcount(swp_entry_t entry)
1497 {
1498 int count, tmp_count, n;
1499 struct swap_info_struct *p;
1500 struct swap_cluster_info *ci;
1501 struct page *page;
1502 pgoff_t offset;
1503 unsigned char *map;
1504
1505 p = _swap_info_get(entry);
1506 if (!p)
1507 return 0;
1508
1509 offset = swp_offset(entry);
1510
1511 ci = lock_cluster_or_swap_info(p, offset);
1512
1513 count = swap_count(p->swap_map[offset]);
1514 if (!(count & COUNT_CONTINUED))
1515 goto out;
1516
1517 count &= ~COUNT_CONTINUED;
1518 n = SWAP_MAP_MAX + 1;
1519
1520 page = vmalloc_to_page(p->swap_map + offset);
1521 offset &= ~PAGE_MASK;
1522 VM_BUG_ON(page_private(page) != SWP_CONTINUED);
1523
1524 do {
1525 page = list_next_entry(page, lru);
1526 map = kmap_atomic(page);
1527 tmp_count = map[offset];
1528 kunmap_atomic(map);
1529
1530 count += (tmp_count & ~COUNT_CONTINUED) * n;
1531 n *= (SWAP_CONT_MAX + 1);
1532 } while (tmp_count & COUNT_CONTINUED);
1533 out:
1534 unlock_cluster_or_swap_info(p, ci);
1535 return count;
1536 }
1537
1538 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1539 swp_entry_t entry)
1540 {
1541 struct swap_cluster_info *ci;
1542 unsigned char *map = si->swap_map;
1543 unsigned long roffset = swp_offset(entry);
1544 unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1545 int i;
1546 bool ret = false;
1547
1548 ci = lock_cluster_or_swap_info(si, offset);
1549 if (!ci || !cluster_is_huge(ci)) {
1550 if (swap_count(map[roffset]))
1551 ret = true;
1552 goto unlock_out;
1553 }
1554 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1555 if (swap_count(map[offset + i])) {
1556 ret = true;
1557 break;
1558 }
1559 }
1560 unlock_out:
1561 unlock_cluster_or_swap_info(si, ci);
1562 return ret;
1563 }
1564
1565 static bool page_swapped(struct page *page)
1566 {
1567 swp_entry_t entry;
1568 struct swap_info_struct *si;
1569
1570 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
1571 return page_swapcount(page) != 0;
1572
1573 page = compound_head(page);
1574 entry.val = page_private(page);
1575 si = _swap_info_get(entry);
1576 if (si)
1577 return swap_page_trans_huge_swapped(si, entry);
1578 return false;
1579 }
1580
1581 static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1582 int *total_swapcount)
1583 {
1584 int i, map_swapcount, _total_mapcount, _total_swapcount;
1585 unsigned long offset = 0;
1586 struct swap_info_struct *si;
1587 struct swap_cluster_info *ci = NULL;
1588 unsigned char *map = NULL;
1589 int mapcount, swapcount = 0;
1590
1591
1592 VM_BUG_ON_PAGE(PageHuge(page), page);
1593
1594 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
1595 mapcount = page_trans_huge_mapcount(page, total_mapcount);
1596 if (PageSwapCache(page))
1597 swapcount = page_swapcount(page);
1598 if (total_swapcount)
1599 *total_swapcount = swapcount;
1600 return mapcount + swapcount;
1601 }
1602
1603 page = compound_head(page);
1604
1605 _total_mapcount = _total_swapcount = map_swapcount = 0;
1606 if (PageSwapCache(page)) {
1607 swp_entry_t entry;
1608
1609 entry.val = page_private(page);
1610 si = _swap_info_get(entry);
1611 if (si) {
1612 map = si->swap_map;
1613 offset = swp_offset(entry);
1614 }
1615 }
1616 if (map)
1617 ci = lock_cluster(si, offset);
1618 for (i = 0; i < HPAGE_PMD_NR; i++) {
1619 mapcount = atomic_read(&page[i]._mapcount) + 1;
1620 _total_mapcount += mapcount;
1621 if (map) {
1622 swapcount = swap_count(map[offset + i]);
1623 _total_swapcount += swapcount;
1624 }
1625 map_swapcount = max(map_swapcount, mapcount + swapcount);
1626 }
1627 unlock_cluster(ci);
1628 if (PageDoubleMap(page)) {
1629 map_swapcount -= 1;
1630 _total_mapcount -= HPAGE_PMD_NR;
1631 }
1632 mapcount = compound_mapcount(page);
1633 map_swapcount += mapcount;
1634 _total_mapcount += mapcount;
1635 if (total_mapcount)
1636 *total_mapcount = _total_mapcount;
1637 if (total_swapcount)
1638 *total_swapcount = _total_swapcount;
1639
1640 return map_swapcount;
1641 }
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
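/*
 * We can write to an anon page without COW if there are no other
 * references to it.  And as a side-effect, free up its swap: because the
 * old content on disk will never be read, and seeking back there to write
 * new content later would only waste time away from clustering.
 *
 * NOTE: total_map_swapcount should not be relied upon by the caller if
 * reuse_swap_page() returns false, but it may always be overwritten.
 */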
1653 bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1654 {
1655 int count, total_mapcount, total_swapcount;
1656
1657 VM_BUG_ON_PAGE(!PageLocked(page), page);
1658 if (unlikely(PageKsm(page)))
1659 return false;
1660 count = page_trans_huge_map_swapcount(page, &total_mapcount,
1661 &total_swapcount);
1662 if (total_map_swapcount)
1663 *total_map_swapcount = total_mapcount + total_swapcount;
1664 if (count == 1 && PageSwapCache(page) &&
1665 (likely(!PageTransCompound(page)) ||
1666
1667 total_swapcount == page_swapcount(page))) {
1668 if (!PageWriteback(page)) {
1669 page = compound_head(page);
1670 delete_from_swap_cache(page);
1671 SetPageDirty(page);
1672 } else {
1673 swp_entry_t entry;
1674 struct swap_info_struct *p;
1675
1676 entry.val = page_private(page);
1677 p = swap_info_get(entry);
1678 if (p->flags & SWP_STABLE_WRITES) {
1679 spin_unlock(&p->lock);
1680 return false;
1681 }
1682 spin_unlock(&p->lock);
1683 }
1684 }
1685
1686 return count <= 1;
1687 }
1688
1689
1690
1691
1692
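/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free the page's swap space.
 */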
1693 int try_to_free_swap(struct page *page)
1694 {
1695 VM_BUG_ON_PAGE(!PageLocked(page), page);
1696
1697 if (!PageSwapCache(page))
1698 return 0;
1699 if (PageWriteback(page))
1700 return 0;
1701 if (page_swapped(page))
1702 return 0;
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719 if (pm_suspended_storage())
1720 return 0;
1721
1722 page = compound_head(page);
1723 delete_from_swap_cache(page);
1724 SetPageDirty(page);
1725 return 1;
1726 }
1727
1728
1729
1730
1731
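/*
 * Free the swap entry like above, but also try to reclaim the page from
 * the swap cache if this was the last reference to the entry.
 */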
1732 int free_swap_and_cache(swp_entry_t entry)
1733 {
1734 struct swap_info_struct *p;
1735 unsigned char count;
1736
1737 if (non_swap_entry(entry))
1738 return 1;
1739
1740 p = _swap_info_get(entry);
1741 if (p) {
1742 count = __swap_entry_free(p, entry, 1);
1743 if (count == SWAP_HAS_CACHE &&
1744 !swap_page_trans_huge_swapped(p, entry))
1745 __try_to_reclaim_swap(p, swp_offset(entry),
1746 TTRS_UNMAPPED | TTRS_FULL);
1747 }
1748 return p != NULL;
1749 }
1750
1751 #ifdef CONFIG_HIBERNATION
1752
1753
1754
1755
1756
1757
1758
1759
1760 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1761 {
1762 struct block_device *bdev = NULL;
1763 int type;
1764
1765 if (device)
1766 bdev = bdget(device);
1767
1768 spin_lock(&swap_lock);
1769 for (type = 0; type < nr_swapfiles; type++) {
1770 struct swap_info_struct *sis = swap_info[type];
1771
1772 if (!(sis->flags & SWP_WRITEOK))
1773 continue;
1774
1775 if (!bdev) {
1776 if (bdev_p)
1777 *bdev_p = bdgrab(sis->bdev);
1778
1779 spin_unlock(&swap_lock);
1780 return type;
1781 }
1782 if (bdev == sis->bdev) {
1783 struct swap_extent *se = first_se(sis);
1784
1785 if (se->start_block == offset) {
1786 if (bdev_p)
1787 *bdev_p = bdgrab(sis->bdev);
1788
1789 spin_unlock(&swap_lock);
1790 bdput(bdev);
1791 return type;
1792 }
1793 }
1794 }
1795 spin_unlock(&swap_lock);
1796 if (bdev)
1797 bdput(bdev);
1798
1799 return -ENODEV;
1800 }
1801
1802
1803
1804
1805
1806 sector_t swapdev_block(int type, pgoff_t offset)
1807 {
1808 struct block_device *bdev;
1809 struct swap_info_struct *si = swap_type_to_swap_info(type);
1810
1811 if (!si || !(si->flags & SWP_WRITEOK))
1812 return 0;
1813 return map_swap_entry(swp_entry(type, offset), &bdev);
1814 }
1815
1816
1817
1818
1819
1820
1821
1822 unsigned int count_swap_pages(int type, int free)
1823 {
1824 unsigned int n = 0;
1825
1826 spin_lock(&swap_lock);
1827 if ((unsigned int)type < nr_swapfiles) {
1828 struct swap_info_struct *sis = swap_info[type];
1829
1830 spin_lock(&sis->lock);
1831 if (sis->flags & SWP_WRITEOK) {
1832 n = sis->pages;
1833 if (free)
1834 n -= sis->inuse_pages;
1835 }
1836 spin_unlock(&sis->lock);
1837 }
1838 spin_unlock(&swap_lock);
1839 return n;
1840 }
1841 #endif
1842
1843 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1844 {
1845 return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1846 }
1847
1848
1849
1850
1851
1852
1853 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1854 unsigned long addr, swp_entry_t entry, struct page *page)
1855 {
1856 struct page *swapcache;
1857 struct mem_cgroup *memcg;
1858 spinlock_t *ptl;
1859 pte_t *pte;
1860 int ret = 1;
1861
1862 swapcache = page;
1863 page = ksm_might_need_to_copy(page, vma, addr);
1864 if (unlikely(!page))
1865 return -ENOMEM;
1866
1867 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1868 &memcg, false)) {
1869 ret = -ENOMEM;
1870 goto out_nolock;
1871 }
1872
1873 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1874 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1875 mem_cgroup_cancel_charge(page, memcg, false);
1876 ret = 0;
1877 goto out;
1878 }
1879
1880 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
1881 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1882 get_page(page);
1883 set_pte_at(vma->vm_mm, addr, pte,
1884 pte_mkold(mk_pte(page, vma->vm_page_prot)));
1885 if (page == swapcache) {
1886 page_add_anon_rmap(page, vma, addr, false);
1887 mem_cgroup_commit_charge(page, memcg, true, false);
1888 } else {
1889 page_add_new_anon_rmap(page, vma, addr, false);
1890 mem_cgroup_commit_charge(page, memcg, false, false);
1891 lru_cache_add_active_or_unevictable(page, vma);
1892 }
1893 swap_free(entry);
1894
1895
1896
1897
1898 activate_page(page);
1899 out:
1900 pte_unmap_unlock(pte, ptl);
1901 out_nolock:
1902 if (page != swapcache) {
1903 unlock_page(page);
1904 put_page(page);
1905 }
1906 return ret;
1907 }
1908
1909 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1910 unsigned long addr, unsigned long end,
1911 unsigned int type, bool frontswap,
1912 unsigned long *fs_pages_to_unuse)
1913 {
1914 struct page *page;
1915 swp_entry_t entry;
1916 pte_t *pte;
1917 struct swap_info_struct *si;
1918 unsigned long offset;
1919 int ret = 0;
1920 volatile unsigned char *swap_map;
1921
1922 si = swap_info[type];
1923 pte = pte_offset_map(pmd, addr);
1924 do {
1925 struct vm_fault vmf;
1926
1927 if (!is_swap_pte(*pte))
1928 continue;
1929
1930 entry = pte_to_swp_entry(*pte);
1931 if (swp_type(entry) != type)
1932 continue;
1933
1934 offset = swp_offset(entry);
1935 if (frontswap && !frontswap_test(si, offset))
1936 continue;
1937
1938 pte_unmap(pte);
1939 swap_map = &si->swap_map[offset];
1940 vmf.vma = vma;
1941 vmf.address = addr;
1942 vmf.pmd = pmd;
1943 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
1944 if (!page) {
1945 if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
1946 goto try_next;
1947 return -ENOMEM;
1948 }
1949
1950 lock_page(page);
1951 wait_on_page_writeback(page);
1952 ret = unuse_pte(vma, pmd, addr, entry, page);
1953 if (ret < 0) {
1954 unlock_page(page);
1955 put_page(page);
1956 goto out;
1957 }
1958
1959 try_to_free_swap(page);
1960 unlock_page(page);
1961 put_page(page);
1962
1963 if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
1964 ret = FRONTSWAP_PAGES_UNUSED;
1965 goto out;
1966 }
1967 try_next:
1968 pte = pte_offset_map(pmd, addr);
1969 } while (pte++, addr += PAGE_SIZE, addr != end);
1970 pte_unmap(pte - 1);
1971
1972 ret = 0;
1973 out:
1974 return ret;
1975 }
1976
1977 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1978 unsigned long addr, unsigned long end,
1979 unsigned int type, bool frontswap,
1980 unsigned long *fs_pages_to_unuse)
1981 {
1982 pmd_t *pmd;
1983 unsigned long next;
1984 int ret;
1985
1986 pmd = pmd_offset(pud, addr);
1987 do {
1988 cond_resched();
1989 next = pmd_addr_end(addr, end);
1990 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1991 continue;
1992 ret = unuse_pte_range(vma, pmd, addr, next, type,
1993 frontswap, fs_pages_to_unuse);
1994 if (ret)
1995 return ret;
1996 } while (pmd++, addr = next, addr != end);
1997 return 0;
1998 }
1999
2000 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
2001 unsigned long addr, unsigned long end,
2002 unsigned int type, bool frontswap,
2003 unsigned long *fs_pages_to_unuse)
2004 {
2005 pud_t *pud;
2006 unsigned long next;
2007 int ret;
2008
2009 pud = pud_offset(p4d, addr);
2010 do {
2011 next = pud_addr_end(addr, end);
2012 if (pud_none_or_clear_bad(pud))
2013 continue;
2014 ret = unuse_pmd_range(vma, pud, addr, next, type,
2015 frontswap, fs_pages_to_unuse);
2016 if (ret)
2017 return ret;
2018 } while (pud++, addr = next, addr != end);
2019 return 0;
2020 }
2021
2022 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
2023 unsigned long addr, unsigned long end,
2024 unsigned int type, bool frontswap,
2025 unsigned long *fs_pages_to_unuse)
2026 {
2027 p4d_t *p4d;
2028 unsigned long next;
2029 int ret;
2030
2031 p4d = p4d_offset(pgd, addr);
2032 do {
2033 next = p4d_addr_end(addr, end);
2034 if (p4d_none_or_clear_bad(p4d))
2035 continue;
2036 ret = unuse_pud_range(vma, p4d, addr, next, type,
2037 frontswap, fs_pages_to_unuse);
2038 if (ret)
2039 return ret;
2040 } while (p4d++, addr = next, addr != end);
2041 return 0;
2042 }
2043
2044 static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
2045 bool frontswap, unsigned long *fs_pages_to_unuse)
2046 {
2047 pgd_t *pgd;
2048 unsigned long addr, end, next;
2049 int ret;
2050
2051 addr = vma->vm_start;
2052 end = vma->vm_end;
2053
2054 pgd = pgd_offset(vma->vm_mm, addr);
2055 do {
2056 next = pgd_addr_end(addr, end);
2057 if (pgd_none_or_clear_bad(pgd))
2058 continue;
2059 ret = unuse_p4d_range(vma, pgd, addr, next, type,
2060 frontswap, fs_pages_to_unuse);
2061 if (ret)
2062 return ret;
2063 } while (pgd++, addr = next, addr != end);
2064 return 0;
2065 }
2066
2067 static int unuse_mm(struct mm_struct *mm, unsigned int type,
2068 bool frontswap, unsigned long *fs_pages_to_unuse)
2069 {
2070 struct vm_area_struct *vma;
2071 int ret = 0;
2072
2073 down_read(&mm->mmap_sem);
2074 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2075 if (vma->anon_vma) {
2076 ret = unuse_vma(vma, type, frontswap,
2077 fs_pages_to_unuse);
2078 if (ret)
2079 break;
2080 }
2081 cond_resched();
2082 }
2083 up_read(&mm->mmap_sem);
2084 return ret;
2085 }
2086
2087
2088
2089
2090
2091
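/*
 * Scan swap_map (or the frontswap map when @frontswap is true) from the
 * position after @prev for an entry that is still in use.  Returns 0 if
 * no in-use entry is found before the end of the map.
 */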
2092 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
2093 unsigned int prev, bool frontswap)
2094 {
2095 unsigned int i;
2096 unsigned char count;
2097
2098
2099
2100
2101
2102
2103
2104 for (i = prev + 1; i < si->max; i++) {
2105 count = READ_ONCE(si->swap_map[i]);
2106 if (count && swap_count(count) != SWAP_MAP_BAD)
2107 if (!frontswap || frontswap_test(si, i))
2108 break;
2109 if ((i % LATENCY_LIMIT) == 0)
2110 cond_resched();
2111 }
2112
2113 if (i == si->max)
2114 i = 0;
2115
2116 return i;
2117 }
2118
2119
2120
2121
2122
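/*
 * Page everything of swap type @type back in, so the swap device can be
 * released.  If @frontswap is true, only unuse up to @pages_to_unuse
 * frontswap pages; pages_to_unuse == 0 means all pages.
 */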
2123 int try_to_unuse(unsigned int type, bool frontswap,
2124 unsigned long pages_to_unuse)
2125 {
2126 struct mm_struct *prev_mm;
2127 struct mm_struct *mm;
2128 struct list_head *p;
2129 int retval = 0;
2130 struct swap_info_struct *si = swap_info[type];
2131 struct page *page;
2132 swp_entry_t entry;
2133 unsigned int i;
2134
2135 if (!si->inuse_pages)
2136 return 0;
2137
2138 if (!frontswap)
2139 pages_to_unuse = 0;
2140
2141 retry:
2142 retval = shmem_unuse(type, frontswap, &pages_to_unuse);
2143 if (retval)
2144 goto out;
2145
2146 prev_mm = &init_mm;
2147 mmget(prev_mm);
2148
2149 spin_lock(&mmlist_lock);
2150 p = &init_mm.mmlist;
2151 while (si->inuse_pages &&
2152 !signal_pending(current) &&
2153 (p = p->next) != &init_mm.mmlist) {
2154
2155 mm = list_entry(p, struct mm_struct, mmlist);
2156 if (!mmget_not_zero(mm))
2157 continue;
2158 spin_unlock(&mmlist_lock);
2159 mmput(prev_mm);
2160 prev_mm = mm;
2161 retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
2162
2163 if (retval) {
2164 mmput(prev_mm);
2165 goto out;
2166 }
2167
2168
2169
2170
2171
2172 cond_resched();
2173 spin_lock(&mmlist_lock);
2174 }
2175 spin_unlock(&mmlist_lock);
2176
2177 mmput(prev_mm);
2178
2179 i = 0;
2180 while (si->inuse_pages &&
2181 !signal_pending(current) &&
2182 (i = find_next_to_unuse(si, i, frontswap)) != 0) {
2183
2184 entry = swp_entry(type, i);
2185 page = find_get_page(swap_address_space(entry), i);
2186 if (!page)
2187 continue;
2188
2189
2190
2191
2192
2193
2194
2195 lock_page(page);
2196 wait_on_page_writeback(page);
2197 try_to_free_swap(page);
2198 unlock_page(page);
2199 put_page(page);
2200
2201
2202
2203
2204
2205
2206 if (pages_to_unuse && --pages_to_unuse == 0)
2207 goto out;
2208 }
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222 if (si->inuse_pages) {
2223 if (!signal_pending(current))
2224 goto retry;
2225 retval = -EINTR;
2226 }
2227 out:
2228 return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
2229 }
2230
2231
2232
2233
2234
2235
2236
2237 static void drain_mmlist(void)
2238 {
2239 struct list_head *p, *next;
2240 unsigned int type;
2241
2242 for (type = 0; type < nr_swapfiles; type++)
2243 if (swap_info[type]->inuse_pages)
2244 return;
2245 spin_lock(&mmlist_lock);
2246 list_for_each_safe(p, next, &init_mm.mmlist)
2247 list_del_init(p);
2248 spin_unlock(&mmlist_lock);
2249 }
2250
2251
2252
2253
2254
2255
2256
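/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to the page offset of the given swap entry.  Note that the
 * return type is sector_t, but the value is a page offset into the bdev,
 * not a sector offset.
 */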
2257 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
2258 {
2259 struct swap_info_struct *sis;
2260 struct swap_extent *se;
2261 pgoff_t offset;
2262
2263 sis = swp_swap_info(entry);
2264 *bdev = sis->bdev;
2265
2266 offset = swp_offset(entry);
2267 se = offset_to_swap_extent(sis, offset);
2268 return se->start_block + (offset - se->start_page);
2269 }
2270
2271
2272
2273
2274 sector_t map_swap_page(struct page *page, struct block_device **bdev)
2275 {
2276 swp_entry_t entry;
2277 entry.val = page_private(page);
2278 return map_swap_entry(entry, bdev);
2279 }
2280
2281
2282
2283
2284 static void destroy_swap_extents(struct swap_info_struct *sis)
2285 {
2286 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2287 struct rb_node *rb = sis->swap_extent_root.rb_node;
2288 struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
2289
2290 rb_erase(rb, &sis->swap_extent_root);
2291 kfree(se);
2292 }
2293
2294 if (sis->flags & SWP_ACTIVATED) {
2295 struct file *swap_file = sis->swap_file;
2296 struct address_space *mapping = swap_file->f_mapping;
2297
2298 sis->flags &= ~SWP_ACTIVATED;
2299 if (mapping->a_ops->swap_deactivate)
2300 mapping->a_ops->swap_deactivate(swap_file);
2301 }
2302 }
2303
2304
2305
2306
2307
2308
2309
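/*
 * Add a block range (and the corresponding page range) to this swapdev's
 * extent tree, merging with the previous extent when the ranges are
 * contiguous.  This function assumes it is called in ascending page order.
 */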
2310 int
2311 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2312 unsigned long nr_pages, sector_t start_block)
2313 {
2314 struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
2315 struct swap_extent *se;
2316 struct swap_extent *new_se;
2317
2318
2319
2320
2321
2322 while (*link) {
2323 parent = *link;
2324 link = &parent->rb_right;
2325 }
2326
2327 if (parent) {
2328 se = rb_entry(parent, struct swap_extent, rb_node);
2329 BUG_ON(se->start_page + se->nr_pages != start_page);
2330 if (se->start_block + se->nr_pages == start_block) {
2331
2332 se->nr_pages += nr_pages;
2333 return 0;
2334 }
2335 }
2336
2337
2338 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
2339 if (new_se == NULL)
2340 return -ENOMEM;
2341 new_se->start_page = start_page;
2342 new_se->nr_pages = nr_pages;
2343 new_se->start_block = start_block;
2344
2345 rb_link_node(&new_se->rb_node, parent, link);
2346 rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
2347 return 1;
2348 }
2349 EXPORT_SYMBOL_GPL(add_swap_extent);
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
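/*
 * A "swap extent" is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An rbtree of swap extents is
 * built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * For an S_ISBLK swap device a single extent covering the whole device is
 * installed; for a regular swapfile the filesystem's ->swap_activate (or
 * the generic activation helper) builds extents matching the file's
 * on-disk layout, which may be discontiguous.
 */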
2381 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2382 {
2383 struct file *swap_file = sis->swap_file;
2384 struct address_space *mapping = swap_file->f_mapping;
2385 struct inode *inode = mapping->host;
2386 int ret;
2387
2388 if (S_ISBLK(inode->i_mode)) {
2389 ret = add_swap_extent(sis, 0, sis->max, 0);
2390 *span = sis->pages;
2391 return ret;
2392 }
2393
2394 if (mapping->a_ops->swap_activate) {
2395 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2396 if (ret >= 0)
2397 sis->flags |= SWP_ACTIVATED;
2398 if (!ret) {
2399 sis->flags |= SWP_FS;
2400 ret = add_swap_extent(sis, 0, sis->max, 0);
2401 *span = sis->pages;
2402 }
2403 return ret;
2404 }
2405
2406 return generic_swapfile_activate(sis, swap_file, span);
2407 }
2408
2409 static int swap_node(struct swap_info_struct *p)
2410 {
2411 struct block_device *bdev;
2412
2413 if (p->bdev)
2414 bdev = p->bdev;
2415 else
2416 bdev = p->swap_file->f_inode->i_sb->s_bdev;
2417
2418 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2419 }
2420
2421 static void setup_swap_info(struct swap_info_struct *p, int prio,
2422 unsigned char *swap_map,
2423 struct swap_cluster_info *cluster_info)
2424 {
2425 int i;
2426
2427 if (prio >= 0)
2428 p->prio = prio;
2429 else
2430 p->prio = --least_priority;
2431
2432
2433
2434
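/*
 * plists sort lowest value first while swap priorities are highest-first,
 * so the plist nodes carry the negated priority.  An auto-assigned
 * (negative) priority device additionally gets priority 1 on the avail
 * list of its own NUMA node, so node-local allocations prefer it.
 */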
2435 p->list.prio = -p->prio;
2436 for_each_node(i) {
2437 if (p->prio >= 0)
2438 p->avail_lists[i].prio = -p->prio;
2439 else {
2440 if (swap_node(p) == i)
2441 p->avail_lists[i].prio = 1;
2442 else
2443 p->avail_lists[i].prio = -p->prio;
2444 }
2445 }
2446 p->swap_map = swap_map;
2447 p->cluster_info = cluster_info;
2448 }
2449
2450 static void _enable_swap_info(struct swap_info_struct *p)
2451 {
2452 p->flags |= SWP_WRITEOK | SWP_VALID;
2453 atomic_long_add(p->pages, &nr_swap_pages);
2454 total_swap_pages += p->pages;
2455
2456 assert_spin_locked(&swap_lock);
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467 plist_add(&p->list, &swap_active_head);
2468 add_to_avail_list(p);
2469 }
2470
2471 static void enable_swap_info(struct swap_info_struct *p, int prio,
2472 unsigned char *swap_map,
2473 struct swap_cluster_info *cluster_info,
2474 unsigned long *frontswap_map)
2475 {
2476 frontswap_init(p->type, frontswap_map);
2477 spin_lock(&swap_lock);
2478 spin_lock(&p->lock);
2479 setup_swap_info(p, prio, swap_map, cluster_info);
2480 spin_unlock(&p->lock);
2481 spin_unlock(&swap_lock);
2482
2483
2484
2485
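/*
 * Make sure the swap_map and cluster_info published by setup_swap_info()
 * above are visible to every get_swap_device() caller before
 * _enable_swap_info() sets SWP_VALID and adds the device to the
 * allocation lists.
 */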
2486 synchronize_rcu();
2487 spin_lock(&swap_lock);
2488 spin_lock(&p->lock);
2489 _enable_swap_info(p);
2490 spin_unlock(&p->lock);
2491 spin_unlock(&swap_lock);
2492 }
2493
2494 static void reinsert_swap_info(struct swap_info_struct *p)
2495 {
2496 spin_lock(&swap_lock);
2497 spin_lock(&p->lock);
2498 setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2499 _enable_swap_info(p);
2500 spin_unlock(&p->lock);
2501 spin_unlock(&swap_lock);
2502 }
2503
2504 bool has_usable_swap(void)
2505 {
2506 bool ret = true;
2507
2508 spin_lock(&swap_lock);
2509 if (plist_head_empty(&swap_active_head))
2510 ret = false;
2511 spin_unlock(&swap_lock);
2512 return ret;
2513 }
2514
2515 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2516 {
2517 struct swap_info_struct *p = NULL;
2518 unsigned char *swap_map;
2519 struct swap_cluster_info *cluster_info;
2520 unsigned long *frontswap_map;
2521 struct file *swap_file, *victim;
2522 struct address_space *mapping;
2523 struct inode *inode;
2524 struct filename *pathname;
2525 int err, found = 0;
2526 unsigned int old_block_size;
2527
2528 if (!capable(CAP_SYS_ADMIN))
2529 return -EPERM;
2530
2531 BUG_ON(!current->mm);
2532
2533 pathname = getname(specialfile);
2534 if (IS_ERR(pathname))
2535 return PTR_ERR(pathname);
2536
2537 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
2538 err = PTR_ERR(victim);
2539 if (IS_ERR(victim))
2540 goto out;
2541
2542 mapping = victim->f_mapping;
2543 spin_lock(&swap_lock);
2544 plist_for_each_entry(p, &swap_active_head, list) {
2545 if (p->flags & SWP_WRITEOK) {
2546 if (p->swap_file->f_mapping == mapping) {
2547 found = 1;
2548 break;
2549 }
2550 }
2551 }
2552 if (!found) {
2553 err = -EINVAL;
2554 spin_unlock(&swap_lock);
2555 goto out_dput;
2556 }
2557 if (!security_vm_enough_memory_mm(current->mm, p->pages))
2558 vm_unacct_memory(p->pages);
2559 else {
2560 err = -ENOMEM;
2561 spin_unlock(&swap_lock);
2562 goto out_dput;
2563 }
2564 del_from_avail_list(p);
2565 spin_lock(&p->lock);
2566 if (p->prio < 0) {
2567 struct swap_info_struct *si = p;
2568 int nid;
2569
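/*
 * Removing an auto-assigned (negative) priority area: raise the priority
 * of every area behind it so the automatically assigned values stay
 * contiguous, and hand the lowest value back to least_priority.
 */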
2570 plist_for_each_entry_continue(si, &swap_active_head, list) {
2571 si->prio++;
2572 si->list.prio--;
2573 for_each_node(nid) {
2574 if (si->avail_lists[nid].prio != 1)
2575 si->avail_lists[nid].prio--;
2576 }
2577 }
2578 least_priority++;
2579 }
2580 plist_del(&p->list, &swap_active_head);
2581 atomic_long_sub(p->pages, &nr_swap_pages);
2582 total_swap_pages -= p->pages;
2583 p->flags &= ~SWP_WRITEOK;
2584 spin_unlock(&p->lock);
2585 spin_unlock(&swap_lock);
2586
2587 disable_swap_slots_cache_lock();
2588
2589 set_current_oom_origin();
2590 err = try_to_unuse(p->type, false, 0);
2591 clear_current_oom_origin();
2592
2593 if (err) {
2594
2595 reinsert_swap_info(p);
2596 reenable_swap_slots_cache_unlock();
2597 goto out_dput;
2598 }
2599
2600 reenable_swap_slots_cache_unlock();
2601
2602 spin_lock(&swap_lock);
2603 spin_lock(&p->lock);
2604 p->flags &= ~SWP_VALID;
2605 spin_unlock(&p->lock);
2606 spin_unlock(&swap_lock);
2607
2608
2609
2610
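/*
 * Wait for any get_swap_device() user still inside its RCU read section to
 * observe that SWP_VALID is gone before the swap map, cluster info and
 * address space are torn down below.
 */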
2611 synchronize_rcu();
2612
2613 flush_work(&p->discard_work);
2614
2615 destroy_swap_extents(p);
2616 if (p->flags & SWP_CONTINUED)
2617 free_swap_count_continuations(p);
2618
2619 if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
2620 atomic_dec(&nr_rotate_swap);
2621
2622 mutex_lock(&swapon_mutex);
2623 spin_lock(&swap_lock);
2624 spin_lock(&p->lock);
2625 drain_mmlist();
2626
2627
2628 p->highest_bit = 0;
2629 while (p->flags >= SWP_SCANNING) {
2630 spin_unlock(&p->lock);
2631 spin_unlock(&swap_lock);
2632 schedule_timeout_uninterruptible(1);
2633 spin_lock(&swap_lock);
2634 spin_lock(&p->lock);
2635 }
2636
2637 swap_file = p->swap_file;
2638 old_block_size = p->old_block_size;
2639 p->swap_file = NULL;
2640 p->max = 0;
2641 swap_map = p->swap_map;
2642 p->swap_map = NULL;
2643 cluster_info = p->cluster_info;
2644 p->cluster_info = NULL;
2645 frontswap_map = frontswap_map_get(p);
2646 spin_unlock(&p->lock);
2647 spin_unlock(&swap_lock);
2648 frontswap_invalidate_area(p->type);
2649 frontswap_map_set(p, NULL);
2650 mutex_unlock(&swapon_mutex);
2651 free_percpu(p->percpu_cluster);
2652 p->percpu_cluster = NULL;
2653 vfree(swap_map);
2654 kvfree(cluster_info);
2655 kvfree(frontswap_map);
2656
2657 swap_cgroup_swapoff(p->type);
2658 exit_swap_address_space(p->type);
2659
2660 inode = mapping->host;
2661 if (S_ISBLK(inode->i_mode)) {
2662 struct block_device *bdev = I_BDEV(inode);
2663
2664 set_blocksize(bdev, old_block_size);
2665 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2666 }
2667
2668 inode_lock(inode);
2669 inode->i_flags &= ~S_SWAPFILE;
2670 inode_unlock(inode);
2671 filp_close(swap_file, NULL);
2672
2673
2674
2675
2676
2677
2678 spin_lock(&swap_lock);
2679 p->flags = 0;
2680 spin_unlock(&swap_lock);
2681
2682 err = 0;
2683 atomic_inc(&proc_poll_event);
2684 wake_up_interruptible(&proc_poll_wait);
2685
2686 out_dput:
2687 filp_close(victim, NULL);
2688 out:
2689 putname(pathname);
2690 return err;
2691 }
2692
2693 #ifdef CONFIG_PROC_FS
2694 static __poll_t swaps_poll(struct file *file, poll_table *wait)
2695 {
2696 struct seq_file *seq = file->private_data;
2697
2698 poll_wait(file, &proc_poll_wait, wait);
2699
2700 if (seq->poll_event != atomic_read(&proc_poll_event)) {
2701 seq->poll_event = atomic_read(&proc_poll_event);
2702 return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
2703 }
2704
2705 return EPOLLIN | EPOLLRDNORM;
2706 }
2707
2708
2709 static void *swap_start(struct seq_file *swap, loff_t *pos)
2710 {
2711 struct swap_info_struct *si;
2712 int type;
2713 loff_t l = *pos;
2714
2715 mutex_lock(&swapon_mutex);
2716
2717 if (!l)
2718 return SEQ_START_TOKEN;
2719
2720 for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
2721 if (!(si->flags & SWP_USED) || !si->swap_map)
2722 continue;
2723 if (!--l)
2724 return si;
2725 }
2726
2727 return NULL;
2728 }
2729
2730 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2731 {
2732 struct swap_info_struct *si = v;
2733 int type;
2734
2735 if (v == SEQ_START_TOKEN)
2736 type = 0;
2737 else
2738 type = si->type + 1;
2739
2740 for (; (si = swap_type_to_swap_info(type)); type++) {
2741 if (!(si->flags & SWP_USED) || !si->swap_map)
2742 continue;
2743 ++*pos;
2744 return si;
2745 }
2746
2747 return NULL;
2748 }
2749
2750 static void swap_stop(struct seq_file *swap, void *v)
2751 {
2752 mutex_unlock(&swapon_mutex);
2753 }
2754
2755 static int swap_show(struct seq_file *swap, void *v)
2756 {
2757 struct swap_info_struct *si = v;
2758 struct file *file;
2759 int len;
2760
2761 if (si == SEQ_START_TOKEN) {
2762 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2763 return 0;
2764 }
2765
2766 file = si->swap_file;
2767 len = seq_file_path(swap, file, " \t\n\\");
2768 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2769 len < 40 ? 40 - len : 1, " ",
2770 S_ISBLK(file_inode(file)->i_mode) ?
2771 "partition" : "file\t",
2772 si->pages << (PAGE_SHIFT - 10),
2773 si->inuse_pages << (PAGE_SHIFT - 10),
2774 si->prio);
2775 return 0;
2776 }
2777
2778 static const struct seq_operations swaps_op = {
2779 .start = swap_start,
2780 .next = swap_next,
2781 .stop = swap_stop,
2782 .show = swap_show
2783 };
2784
2785 static int swaps_open(struct inode *inode, struct file *file)
2786 {
2787 struct seq_file *seq;
2788 int ret;
2789
2790 ret = seq_open(file, &swaps_op);
2791 if (ret)
2792 return ret;
2793
2794 seq = file->private_data;
2795 seq->poll_event = atomic_read(&proc_poll_event);
2796 return 0;
2797 }
2798
2799 static const struct file_operations proc_swaps_operations = {
2800 .open = swaps_open,
2801 .read = seq_read,
2802 .llseek = seq_lseek,
2803 .release = seq_release,
2804 .poll = swaps_poll,
2805 };
2806
2807 static int __init procswaps_init(void)
2808 {
2809 proc_create("swaps", 0, NULL, &proc_swaps_operations);
2810 return 0;
2811 }
2812 __initcall(procswaps_init);
2813 #endif
2814
2815 #ifdef MAX_SWAPFILES_CHECK
2816 static int __init max_swapfiles_check(void)
2817 {
2818 MAX_SWAPFILES_CHECK();
2819 return 0;
2820 }
2821 late_initcall(max_swapfiles_check);
2822 #endif
2823
2824 static struct swap_info_struct *alloc_swap_info(void)
2825 {
2826 struct swap_info_struct *p;
2827 unsigned int type;
2828 int i;
2829
2830 p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
2831 if (!p)
2832 return ERR_PTR(-ENOMEM);
2833
2834 spin_lock(&swap_lock);
2835 for (type = 0; type < nr_swapfiles; type++) {
2836 if (!(swap_info[type]->flags & SWP_USED))
2837 break;
2838 }
2839 if (type >= MAX_SWAPFILES) {
2840 spin_unlock(&swap_lock);
2841 kvfree(p);
2842 return ERR_PTR(-EPERM);
2843 }
2844 if (type >= nr_swapfiles) {
2845 p->type = type;
2846 WRITE_ONCE(swap_info[type], p);
2847
2848
2849
2850
2851
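/*
 * Publish the new swap_info_struct before raising nr_swapfiles, so a
 * lockless reader that sees the new nr_swapfiles always finds a valid
 * pointer (pairs with the smp_rmb() in swap_type_to_swap_info()).
 */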
2852 smp_wmb();
2853 WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
2854 } else {
2855 kvfree(p);
2856 p = swap_info[type];
2857
2858
2859
2860
2861 }
2862 p->swap_extent_root = RB_ROOT;
2863 plist_node_init(&p->list, 0);
2864 for_each_node(i)
2865 plist_node_init(&p->avail_lists[i], 0);
2866 p->flags = SWP_USED;
2867 spin_unlock(&swap_lock);
2868 spin_lock_init(&p->lock);
2869 spin_lock_init(&p->cont_lock);
2870
2871 return p;
2872 }
2873
2874 static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2875 {
2876 int error;
2877
2878 if (S_ISBLK(inode->i_mode)) {
2879 p->bdev = bdgrab(I_BDEV(inode));
2880 error = blkdev_get(p->bdev,
2881 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2882 if (error < 0) {
2883 p->bdev = NULL;
2884 return error;
2885 }
2886 p->old_block_size = block_size(p->bdev);
2887 error = set_blocksize(p->bdev, PAGE_SIZE);
2888 if (error < 0)
2889 return error;
2890 p->flags |= SWP_BLKDEV;
2891 } else if (S_ISREG(inode->i_mode)) {
2892 p->bdev = inode->i_sb->s_bdev;
2893 }
2894
2895 return 0;
2896 }
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
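/*
 * The architecture-independent swap size limit: encode the largest possible
 * offset into a swap pte, decode it again, and whatever survives the round
 * trip (plus one) is the number of pages a swap area may hold.
 */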
2915 unsigned long generic_max_swapfile_size(void)
2916 {
2917 return swp_offset(pte_to_swp_entry(
2918 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2919 }
2920
2921
2922 __weak unsigned long max_swapfile_size(void)
2923 {
2924 return generic_max_swapfile_size();
2925 }
2926
2927 static unsigned long read_swap_header(struct swap_info_struct *p,
2928 union swap_header *swap_header,
2929 struct inode *inode)
2930 {
2931 int i;
2932 unsigned long maxpages;
2933 unsigned long swapfilepages;
2934 unsigned long last_page;
2935
2936 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
2937 pr_err("Unable to find swap-space signature\n");
2938 return 0;
2939 }
2940
2941
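/*
 * If the byte-swapped version field reads as 1, the header was written on a
 * machine of the opposite endianness: convert the fields used below in place.
 */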
2942 if (swab32(swap_header->info.version) == 1) {
2943 swab32s(&swap_header->info.version);
2944 swab32s(&swap_header->info.last_page);
2945 swab32s(&swap_header->info.nr_badpages);
2946 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2947 return 0;
2948 for (i = 0; i < swap_header->info.nr_badpages; i++)
2949 swab32s(&swap_header->info.badpages[i]);
2950 }
2951
2952 if (swap_header->info.version != 1) {
2953 pr_warn("Unable to handle swap header version %d\n",
2954 swap_header->info.version);
2955 return 0;
2956 }
2957
2958 p->lowest_bit = 1;
2959 p->cluster_next = 1;
2960 p->cluster_nr = 0;
2961
2962 maxpages = max_swapfile_size();
2963 last_page = swap_header->info.last_page;
2964 if (!last_page) {
2965 pr_warn("Empty swap-file\n");
2966 return 0;
2967 }
2968 if (last_page > maxpages) {
2969 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2970 maxpages << (PAGE_SHIFT - 10),
2971 last_page << (PAGE_SHIFT - 10));
2972 }
2973 if (maxpages > last_page) {
2974 maxpages = last_page + 1;
2975
2976 if ((unsigned int)maxpages == 0)
2977 maxpages = UINT_MAX;
2978 }
2979 p->highest_bit = maxpages - 1;
2980
2981 if (!maxpages)
2982 return 0;
2983 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2984 if (swapfilepages && maxpages > swapfilepages) {
2985 pr_warn("Swap area shorter than signature indicates\n");
2986 return 0;
2987 }
2988 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2989 return 0;
2990 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2991 return 0;
2992
2993 return maxpages;
2994 }
2995
2996 #define SWAP_CLUSTER_INFO_COLS \
2997 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
2998 #define SWAP_CLUSTER_SPACE_COLS \
2999 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
3000 #define SWAP_CLUSTER_COLS \
3001 max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
3002
3003 static int setup_swap_map_and_extents(struct swap_info_struct *p,
3004 union swap_header *swap_header,
3005 unsigned char *swap_map,
3006 struct swap_cluster_info *cluster_info,
3007 unsigned long maxpages,
3008 sector_t *span)
3009 {
3010 unsigned int j, k;
3011 unsigned int nr_good_pages;
3012 int nr_extents;
3013 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3014 unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
3015 unsigned long i, idx;
3016
3017 nr_good_pages = maxpages - 1;
3018
3019 cluster_list_init(&p->free_clusters);
3020 cluster_list_init(&p->discard_clusters);
3021
3022 for (i = 0; i < swap_header->info.nr_badpages; i++) {
3023 unsigned int page_nr = swap_header->info.badpages[i];
3024 if (page_nr == 0 || page_nr > swap_header->info.last_page)
3025 return -EINVAL;
3026 if (page_nr < maxpages) {
3027 swap_map[page_nr] = SWAP_MAP_BAD;
3028 nr_good_pages--;
3029
3030
3031
3032
3033 inc_cluster_info_page(p, cluster_info, page_nr);
3034 }
3035 }
3036
3037
3038 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
3039 inc_cluster_info_page(p, cluster_info, i);
3040
3041 if (nr_good_pages) {
3042 swap_map[0] = SWAP_MAP_BAD;
3043
3044
3045
3046
3047 inc_cluster_info_page(p, cluster_info, 0);
3048 p->max = maxpages;
3049 p->pages = nr_good_pages;
3050 nr_extents = setup_swap_extents(p, span);
3051 if (nr_extents < 0)
3052 return nr_extents;
3053 nr_good_pages = p->pages;
3054 }
3055 if (!nr_good_pages) {
3056 pr_warn("Empty swap-file\n");
3057 return -EINVAL;
3058 }
3059
3060 if (!cluster_info)
3061 return nr_extents;
3062
3063
3064
3065
3066
3067
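/*
 * Thread the free clusters onto free_clusters column by column rather than
 * in index order, so that consumers starting from different columns touch
 * different swap_cluster_info cache lines and different swap address
 * spaces instead of contending on the same ones.
 */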
3068 for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
3069 j = (k + col) % SWAP_CLUSTER_COLS;
3070 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
3071 idx = i * SWAP_CLUSTER_COLS + j;
3072 if (idx >= nr_clusters)
3073 continue;
3074 if (cluster_count(&cluster_info[idx]))
3075 continue;
3076 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
3077 cluster_list_add_tail(&p->free_clusters, cluster_info,
3078 idx);
3079 }
3080 }
3081 return nr_extents;
3082 }
3083
3084
3085
3086
3087
3088 static bool swap_discardable(struct swap_info_struct *si)
3089 {
3090 struct request_queue *q = bdev_get_queue(si->bdev);
3091
3092 if (!q || !blk_queue_discard(q))
3093 return false;
3094
3095 return true;
3096 }
3097
3098 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3099 {
3100 struct swap_info_struct *p;
3101 struct filename *name;
3102 struct file *swap_file = NULL;
3103 struct address_space *mapping;
3104 int prio;
3105 int error;
3106 union swap_header *swap_header;
3107 int nr_extents;
3108 sector_t span;
3109 unsigned long maxpages;
3110 unsigned char *swap_map = NULL;
3111 struct swap_cluster_info *cluster_info = NULL;
3112 unsigned long *frontswap_map = NULL;
3113 struct page *page = NULL;
3114 struct inode *inode = NULL;
3115 bool inced_nr_rotate_swap = false;
3116
3117 if (swap_flags & ~SWAP_FLAGS_VALID)
3118 return -EINVAL;
3119
3120 if (!capable(CAP_SYS_ADMIN))
3121 return -EPERM;
3122
3123 if (!swap_avail_heads)
3124 return -ENOMEM;
3125
3126 p = alloc_swap_info();
3127 if (IS_ERR(p))
3128 return PTR_ERR(p);
3129
3130 INIT_WORK(&p->discard_work, swap_discard_work);
3131
3132 name = getname(specialfile);
3133 if (IS_ERR(name)) {
3134 error = PTR_ERR(name);
3135 name = NULL;
3136 goto bad_swap;
3137 }
3138 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
3139 if (IS_ERR(swap_file)) {
3140 error = PTR_ERR(swap_file);
3141 swap_file = NULL;
3142 goto bad_swap;
3143 }
3144
3145 p->swap_file = swap_file;
3146 mapping = swap_file->f_mapping;
3147 inode = mapping->host;
3148
3149 error = claim_swapfile(p, inode);
3150 if (unlikely(error))
3151 goto bad_swap;
3152
3153 inode_lock(inode);
3154 if (IS_SWAPFILE(inode)) {
3155 error = -EBUSY;
3156 goto bad_swap_unlock_inode;
3157 }
3158
3159
3160
3161
3162 if (!mapping->a_ops->readpage) {
3163 error = -EINVAL;
3164 goto bad_swap_unlock_inode;
3165 }
3166 page = read_mapping_page(mapping, 0, swap_file);
3167 if (IS_ERR(page)) {
3168 error = PTR_ERR(page);
3169 goto bad_swap_unlock_inode;
3170 }
3171 swap_header = kmap(page);
3172
3173 maxpages = read_swap_header(p, swap_header, inode);
3174 if (unlikely(!maxpages)) {
3175 error = -EINVAL;
3176 goto bad_swap_unlock_inode;
3177 }
3178
3179
3180 swap_map = vzalloc(maxpages);
3181 if (!swap_map) {
3182 error = -ENOMEM;
3183 goto bad_swap_unlock_inode;
3184 }
3185
3186 if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3187 p->flags |= SWP_STABLE_WRITES;
3188
3189 if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3190 p->flags |= SWP_SYNCHRONOUS_IO;
3191
3192 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
3193 int cpu;
3194 unsigned long ci, nr_cluster;
3195
3196 p->flags |= SWP_SOLIDSTATE;
3197
3198
3199
3200
3201 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
3202 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3203
3204 cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
3205 GFP_KERNEL);
3206 if (!cluster_info) {
3207 error = -ENOMEM;
3208 goto bad_swap_unlock_inode;
3209 }
3210
3211 for (ci = 0; ci < nr_cluster; ci++)
3212 spin_lock_init(&((cluster_info + ci)->lock));
3213
3214 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
3215 if (!p->percpu_cluster) {
3216 error = -ENOMEM;
3217 goto bad_swap_unlock_inode;
3218 }
3219 for_each_possible_cpu(cpu) {
3220 struct percpu_cluster *cluster;
3221 cluster = per_cpu_ptr(p->percpu_cluster, cpu);
3222 cluster_set_null(&cluster->index);
3223 }
3224 } else {
3225 atomic_inc(&nr_rotate_swap);
3226 inced_nr_rotate_swap = true;
3227 }
3228
3229 error = swap_cgroup_swapon(p->type, maxpages);
3230 if (error)
3231 goto bad_swap_unlock_inode;
3232
3233 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
3234 cluster_info, maxpages, &span);
3235 if (unlikely(nr_extents < 0)) {
3236 error = nr_extents;
3237 goto bad_swap_unlock_inode;
3238 }
3239
3240 if (IS_ENABLED(CONFIG_FRONTSWAP))
3241 frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
3242 sizeof(long),
3243 GFP_KERNEL);
3244
3245 if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
3246
3247
3248
3249
3250
3251
3252 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
3253 SWP_PAGE_DISCARD);
3254
3255
3256
3257
3258
3259
3260
3261 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3262 p->flags &= ~SWP_PAGE_DISCARD;
3263 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3264 p->flags &= ~SWP_AREA_DISCARD;
3265
3266
3267 if (p->flags & SWP_AREA_DISCARD) {
3268 int err = discard_swap(p);
3269 if (unlikely(err))
3270 pr_err("swapon: discard_swap(%p): %d\n",
3271 p, err);
3272 }
3273 }
3274
3275 error = init_swap_address_space(p->type, maxpages);
3276 if (error)
3277 goto bad_swap_unlock_inode;
3278
3279
3280
3281
3282
3283 inode->i_flags |= S_SWAPFILE;
3284 error = inode_drain_writes(inode);
3285 if (error) {
3286 inode->i_flags &= ~S_SWAPFILE;
3287 goto bad_swap_unlock_inode;
3288 }
3289
3290 mutex_lock(&swapon_mutex);
3291 prio = -1;
3292 if (swap_flags & SWAP_FLAG_PREFER)
3293 prio =
3294 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3295 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
3296
3297 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
3298 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
3299 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
3300 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
3301 (p->flags & SWP_DISCARDABLE) ? "D" : "",
3302 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
3303 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
3304 (frontswap_map) ? "FS" : "");
3305
3306 mutex_unlock(&swapon_mutex);
3307 atomic_inc(&proc_poll_event);
3308 wake_up_interruptible(&proc_poll_wait);
3309
3310 error = 0;
3311 goto out;
3312 bad_swap_unlock_inode:
3313 inode_unlock(inode);
3314 bad_swap:
3315 free_percpu(p->percpu_cluster);
3316 p->percpu_cluster = NULL;
3317 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
3318 set_blocksize(p->bdev, p->old_block_size);
3319 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3320 }
3321 inode = NULL;
3322 destroy_swap_extents(p);
3323 swap_cgroup_swapoff(p->type);
3324 spin_lock(&swap_lock);
3325 p->swap_file = NULL;
3326 p->flags = 0;
3327 spin_unlock(&swap_lock);
3328 vfree(swap_map);
3329 kvfree(cluster_info);
3330 kvfree(frontswap_map);
3331 if (inced_nr_rotate_swap)
3332 atomic_dec(&nr_rotate_swap);
3333 if (swap_file)
3334 filp_close(swap_file, NULL);
3335 out:
3336 if (page && !IS_ERR(page)) {
3337 kunmap(page);
3338 put_page(page);
3339 }
3340 if (name)
3341 putname(name);
3342 if (inode)
3343 inode_unlock(inode);
3344 if (!error)
3345 enable_swap_slots_cache();
3346 return error;
3347 }
3348
3349 void si_swapinfo(struct sysinfo *val)
3350 {
3351 unsigned int type;
3352 unsigned long nr_to_be_unused = 0;
3353
3354 spin_lock(&swap_lock);
3355 for (type = 0; type < nr_swapfiles; type++) {
3356 struct swap_info_struct *si = swap_info[type];
3357
3358 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3359 nr_to_be_unused += si->inuse_pages;
3360 }
3361 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3362 val->totalswap = total_swap_pages + nr_to_be_unused;
3363 spin_unlock(&swap_lock);
3364 }
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
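/*
 * Adjust the swap count for @entry.  @usage is SWAP_HAS_CACHE (the swap
 * cache claims the slot), SWAP_MAP_SHMEM (shmem owns it), or 1 (one more
 * page-table reference).  Returns 0 on success, -ENOENT if the entry is
 * free or bad, -EEXIST if the cache bit is already set, -EINVAL on a
 * corrupted count, and -ENOMEM when a count continuation page must first
 * be added with add_swap_count_continuation().
 */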
3377 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
3378 {
3379 struct swap_info_struct *p;
3380 struct swap_cluster_info *ci;
3381 unsigned long offset;
3382 unsigned char count;
3383 unsigned char has_cache;
3384 int err = -EINVAL;
3385
3386 p = get_swap_device(entry);
3387 if (!p)
3388 goto out;
3389
3390 offset = swp_offset(entry);
3391 ci = lock_cluster_or_swap_info(p, offset);
3392
3393 count = p->swap_map[offset];
3394
3395
3396
3397
3398
3399 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
3400 err = -ENOENT;
3401 goto unlock_out;
3402 }
3403
3404 has_cache = count & SWAP_HAS_CACHE;
3405 count &= ~SWAP_HAS_CACHE;
3406 err = 0;
3407
3408 if (usage == SWAP_HAS_CACHE) {
3409
3410
3411 if (!has_cache && count)
3412 has_cache = SWAP_HAS_CACHE;
3413 else if (has_cache)
3414 err = -EEXIST;
3415 else
3416 err = -ENOENT;
3417
3418 } else if (count || has_cache) {
3419
3420 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
3421 count += usage;
3422 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
3423 err = -EINVAL;
3424 else if (swap_count_continued(p, offset, count))
3425 count = COUNT_CONTINUED;
3426 else
3427 err = -ENOMEM;
3428 } else
3429 err = -ENOENT;
3430
3431 p->swap_map[offset] = count | has_cache;
3432
3433 unlock_out:
3434 unlock_cluster_or_swap_info(p, ci);
3435 out:
3436 if (p)
3437 put_swap_device(p);
3438 return err;
3439 }
3440
3441
3442
3443
3444
3445 void swap_shmem_alloc(swp_entry_t entry)
3446 {
3447 __swap_duplicate(entry, SWAP_MAP_SHMEM);
3448 }
3449
3450
3451
3452
3453
3454
3455
3456
3457 int swap_duplicate(swp_entry_t entry)
3458 {
3459 int err = 0;
3460
3461 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
3462 err = add_swap_count_continuation(entry, GFP_ATOMIC);
3463 return err;
3464 }
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474 int swapcache_prepare(swp_entry_t entry)
3475 {
3476 return __swap_duplicate(entry, SWAP_HAS_CACHE);
3477 }
3478
3479 struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3480 {
3481 return swap_type_to_swap_info(swp_type(entry));
3482 }
3483
3484 struct swap_info_struct *page_swap_info(struct page *page)
3485 {
3486 swp_entry_t entry = { .val = page_private(page) };
3487 return swp_swap_info(entry);
3488 }
3489
3490
3491
3492
3493 struct address_space *__page_file_mapping(struct page *page)
3494 {
3495 return page_swap_info(page)->swap_file->f_mapping;
3496 }
3497 EXPORT_SYMBOL_GPL(__page_file_mapping);
3498
3499 pgoff_t __page_file_index(struct page *page)
3500 {
3501 swp_entry_t swap = { .val = page_private(page) };
3502 return swp_offset(swap);
3503 }
3504 EXPORT_SYMBOL_GPL(__page_file_index);
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
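/*
 * When a swap_map count would overflow SWAP_MAP_MAX, the excess is kept in
 * "continuation" pages chained off the lru list of the page backing that
 * part of the vmalloc'ed swap_map.  This allocates and links one more
 * continuation page for @entry, returning -ENOMEM only if a page is
 * actually needed and the @gfp_mask allocation fails.
 */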
3521 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
3522 {
3523 struct swap_info_struct *si;
3524 struct swap_cluster_info *ci;
3525 struct page *head;
3526 struct page *page;
3527 struct page *list_page;
3528 pgoff_t offset;
3529 unsigned char count;
3530 int ret = 0;
3531
3532
3533
3534
3535
3536 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
3537
3538 si = get_swap_device(entry);
3539 if (!si) {
3540
3541
3542
3543
3544 goto outer;
3545 }
3546 spin_lock(&si->lock);
3547
3548 offset = swp_offset(entry);
3549
3550 ci = lock_cluster(si, offset);
3551
3552 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
3553
3554 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
3555
3556
3557
3558
3559
3560 goto out;
3561 }
3562
3563 if (!page) {
3564 ret = -ENOMEM;
3565 goto out;
3566 }
3567
3568
3569
3570
3571
3572
3573 head = vmalloc_to_page(si->swap_map + offset);
3574 offset &= ~PAGE_MASK;
3575
3576 spin_lock(&si->cont_lock);
3577
3578
3579
3580
3581 if (!page_private(head)) {
3582 BUG_ON(count & COUNT_CONTINUED);
3583 INIT_LIST_HEAD(&head->lru);
3584 set_page_private(head, SWP_CONTINUED);
3585 si->flags |= SWP_CONTINUED;
3586 }
3587
3588 list_for_each_entry(list_page, &head->lru, lru) {
3589 unsigned char *map;
3590
3591
3592
3593
3594
3595 if (!(count & COUNT_CONTINUED))
3596 goto out_unlock_cont;
3597
3598 map = kmap_atomic(list_page) + offset;
3599 count = *map;
3600 kunmap_atomic(map);
3601
3602
3603
3604
3605
3606 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
3607 goto out_unlock_cont;
3608 }
3609
3610 list_add_tail(&page->lru, &head->lru);
3611 page = NULL;
3612 out_unlock_cont:
3613 spin_unlock(&si->cont_lock);
3614 out:
3615 unlock_cluster(ci);
3616 spin_unlock(&si->lock);
3617 put_swap_device(si);
3618 outer:
3619 if (page)
3620 __free_page(page);
3621 return ret;
3622 }
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
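/*
 * Called with the cluster or swap_info lock held once swap_map[offset] has
 * reached SWAP_MAP_MAX (incrementing, @count == SWAP_MAP_MAX) or dropped
 * back to it (decrementing, @count == COUNT_CONTINUED).  Walks the
 * continuation pages to carry the increment or borrow for the decrement.
 * For an increment it returns false when every continuation byte is already
 * at SWAP_CONT_MAX and a new continuation page must be added first; for a
 * decrement it returns true while the continuation pages still hold part of
 * the count.
 */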
3633 static bool swap_count_continued(struct swap_info_struct *si,
3634 pgoff_t offset, unsigned char count)
3635 {
3636 struct page *head;
3637 struct page *page;
3638 unsigned char *map;
3639 bool ret;
3640
3641 head = vmalloc_to_page(si->swap_map + offset);
3642 if (page_private(head) != SWP_CONTINUED) {
3643 BUG_ON(count & COUNT_CONTINUED);
3644 return false;
3645 }
3646
3647 spin_lock(&si->cont_lock);
3648 offset &= ~PAGE_MASK;
3649 page = list_entry(head->lru.next, struct page, lru);
3650 map = kmap_atomic(page) + offset;
3651
3652 if (count == SWAP_MAP_MAX)
3653 goto init_map;
3654
3655 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) {
3656
3657
3658
3659 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
3660 kunmap_atomic(map);
3661 page = list_entry(page->lru.next, struct page, lru);
3662 BUG_ON(page == head);
3663 map = kmap_atomic(page) + offset;
3664 }
3665 if (*map == SWAP_CONT_MAX) {
3666 kunmap_atomic(map);
3667 page = list_entry(page->lru.next, struct page, lru);
3668 if (page == head) {
3669 ret = false;
3670 goto out;
3671 }
3672 map = kmap_atomic(page) + offset;
3673 init_map: *map = 0;
3674 }
3675 *map += 1;
3676 kunmap_atomic(map);
3677 page = list_entry(page->lru.prev, struct page, lru);
3678 while (page != head) {
3679 map = kmap_atomic(page) + offset;
3680 *map = COUNT_CONTINUED;
3681 kunmap_atomic(map);
3682 page = list_entry(page->lru.prev, struct page, lru);
3683 }
3684 ret = true;
3685
3686 } else {
3687
3688
3689
3690 BUG_ON(count != COUNT_CONTINUED);
3691 while (*map == COUNT_CONTINUED) {
3692 kunmap_atomic(map);
3693 page = list_entry(page->lru.next, struct page, lru);
3694 BUG_ON(page == head);
3695 map = kmap_atomic(page) + offset;
3696 }
3697 BUG_ON(*map == 0);
3698 *map -= 1;
3699 if (*map == 0)
3700 count = 0;
3701 kunmap_atomic(map);
3702 page = list_entry(page->lru.prev, struct page, lru);
3703 while (page != head) {
3704 map = kmap_atomic(page) + offset;
3705 *map = SWAP_CONT_MAX | count;
3706 count = COUNT_CONTINUED;
3707 kunmap_atomic(map);
3708 page = list_entry(page->lru.prev, struct page, lru);
3709 }
3710 ret = count == COUNT_CONTINUED;
3711 }
3712 out:
3713 spin_unlock(&si->cont_lock);
3714 return ret;
3715 }
3716
3717
3718
3719
3720
3721 static void free_swap_count_continuations(struct swap_info_struct *si)
3722 {
3723 pgoff_t offset;
3724
3725 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
3726 struct page *head;
3727 head = vmalloc_to_page(si->swap_map + offset);
3728 if (page_private(head)) {
3729 struct page *page, *next;
3730
3731 list_for_each_entry_safe(page, next, &head->lru, lru) {
3732 list_del(&page->lru);
3733 __free_page(page);
3734 }
3735 }
3736 }
3737 }
3738
3739 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
3740 void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
3741 gfp_t gfp_mask)
3742 {
3743 struct swap_info_struct *si, *next;
3744 if (!(gfp_mask & __GFP_IO) || !memcg)
3745 return;
3746
3747 if (!blk_cgroup_congested())
3748 return;
3749
3750
3751
3752
3753
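/*
 * If the task is already marked for blk-cgroup throttling on some queue,
 * there is nothing more to do; avoid taking swap_avail_lock.
 */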
3754 if (current->throttle_queue)
3755 return;
3756
3757 spin_lock(&swap_avail_lock);
3758 plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
3759 avail_lists[node]) {
3760 if (si->bdev) {
3761 blkcg_schedule_throttle(bdev_get_queue(si->bdev),
3762 true);
3763 break;
3764 }
3765 }
3766 spin_unlock(&swap_avail_lock);
3767 }
3768 #endif
3769
3770 static int __init swapfile_init(void)
3771 {
3772 int nid;
3773
3774 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3775 GFP_KERNEL);
3776 if (!swap_avail_heads) {
3777 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3778 return -ENOMEM;
3779 }
3780
3781 for_each_node(nid)
3782 plist_head_init(&swap_avail_heads[nid]);
3783
3784 return 0;
3785 }
3786 subsys_initcall(swapfile_init);