/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */
6
#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>
14
15/**
16 * page_counter_cancel - take pages out of the local counter
17 * @counter: counter
18 * @nr_pages: number of pages to cancel
19 */
20void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
21{
22	long new;
23
24	new = atomic_long_sub_return(nr_pages, &counter->count);
25	/* More uncharges than charges? */
26	WARN_ON_ONCE(new < 0);
27}
28
29/**
30 * page_counter_charge - hierarchically charge pages
31 * @counter: counter
32 * @nr_pages: number of pages to charge
33 *
34 * NOTE: This does not consider any configured counter limits.
35 */
36void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
37{
38	struct page_counter *c;
39
40	for (c = counter; c; c = c->parent) {
41		long new;
42
43		new = atomic_long_add_return(nr_pages, &c->count);
44		/*
45		 * This is indeed racy, but we can live with some
46		 * inaccuracy in the watermark.
47		 */
48		if (new > c->watermark)
49			c->watermark = new;
50	}
51}
52
/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: points first counter to hit its limit, if any
 *
 * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
 * its ancestors has hit its configured limit.
 *
 * On failure, all charges taken from @counter up to (but excluding)
 * the counter that hit its limit are rolled back again.
 */
int page_counter_try_charge(struct page_counter *counter,
			    unsigned long nr_pages,
			    struct page_counter **fail)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		long new;
		/*
		 * Charge speculatively to avoid an expensive CAS.  If
		 * a bigger charge fails, it might falsely lock out a
		 * racing smaller charge and send it into reclaim
		 * early, but the error is limited to the difference
		 * between the two sizes, which is less than 2M/4M in
		 * case of a THP locking out a regular page charge.
		 *
		 * The atomic_long_add_return() implies a full memory
		 * barrier between incrementing the count and reading
		 * the limit.  When racing with page_counter_limit(),
		 * we either see the new limit or the setter sees the
		 * counter has changed and retries.
		 */
		new = atomic_long_add_return(nr_pages, &c->count);
		if (new > c->limit) {
			/* Undo the speculative charge on this level only. */
			atomic_long_sub(nr_pages, &c->count);
			/*
			 * This is racy, but we can live with some
			 * inaccuracy in the failcnt.
			 */
			c->failcnt++;
			*fail = c;
			goto failed;
		}
		/*
		 * Just like with failcnt, we can live with some
		 * inaccuracy in the watermark.
		 */
		if (new > c->watermark)
			c->watermark = new;
	}
	return 0;

failed:
	/*
	 * Roll back the charges applied to the levels below the one
	 * that hit its limit (that level already undid its own).
	 */
	for (c = counter; c != *fail; c = c->parent)
		page_counter_cancel(c, nr_pages);

	return -ENOMEM;
}
110
111/**
112 * page_counter_uncharge - hierarchically uncharge pages
113 * @counter: counter
114 * @nr_pages: number of pages to uncharge
115 */
116void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
117{
118	struct page_counter *c;
119
120	for (c = counter; c; c = c->parent)
121		page_counter_cancel(c, nr_pages);
122}
123
/**
 * page_counter_limit - limit the number of pages allowed
 * @counter: counter
 * @limit: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_limit(struct page_counter *counter, unsigned long limit)
{
	/* Retry until the limit is installed without racing a charge past it. */
	for (;;) {
		unsigned long old;
		long count;

		/*
		 * Update the limit while making sure that it's not
		 * below the concurrently-changing counter value.
		 *
		 * The xchg implies two full memory barriers before
		 * and after, so the read-swap-read is ordered and
		 * ensures coherency with page_counter_try_charge():
		 * that function modifies the count before checking
		 * the limit, so if it sees the old limit, we see the
		 * modified counter and retry.
		 */
		count = atomic_long_read(&counter->count);

		if (count > limit)
			return -EBUSY;

		old = xchg(&counter->limit, limit);

		/* No concurrent charge slipped past the new limit: done. */
		if (atomic_long_read(&counter->count) <= count)
			return 0;

		/* A charge raced in; restore the old limit and try again. */
		counter->limit = old;
		cond_resched();
	}
}
165
166/**
167 * page_counter_memparse - memparse() for page counter limits
168 * @buf: string to parse
169 * @max: string meaning maximum possible value
170 * @nr_pages: returns the result in number of pages
171 *
172 * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
173 * limited to %PAGE_COUNTER_MAX.
174 */
175int page_counter_memparse(const char *buf, const char *max,
176			  unsigned long *nr_pages)
177{
178	char *end;
179	u64 bytes;
180
181	if (!strcmp(buf, max)) {
182		*nr_pages = PAGE_COUNTER_MAX;
183		return 0;
184	}
185
186	bytes = memparse(buf, &end);
187	if (*end != '\0')
188		return -EINVAL;
189
190	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
191
192	return 0;
193}
194