/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

struct hugetlb_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for hugepages from hugetlb.
	 */
	struct page_counter hugepage[HUGE_MAX_HSTATE];
};
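
/*
 * Each control file encodes both the hstate it belongs to and the resource
 * attribute it exposes in cft->private: the hstate index goes in the upper
 * 16 bits and the RES_* value (defined below) in the lower 16 bits.
 * For example, MEMFILE_PRIVATE(1, RES_LIMIT) is 0x10001; MEMFILE_IDX()
 * recovers the hstate index 1 and MEMFILE_ATTR() recovers RES_LIMIT.
 */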
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
	return hugetlb_cgroup_from_css(h_cg->css.parent);
}

static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
	int idx;

	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
		if (page_counter_read(&h_cg->hugepage[idx]))
			return true;
	}
	return false;
}

static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;
	int idx;

	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	if (parent_h_cgroup) {
		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
			page_counter_init(&h_cgroup->hugepage[idx],
					  &parent_h_cgroup->hugepage[idx]);
	} else {
		root_h_cgroup = h_cgroup;
		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
			page_counter_init(&h_cgroup->hugepage[idx], NULL);
	}
	return &h_cgroup->css;
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cgroup;

	h_cgroup = hugetlb_cgroup_from_css(css);
	kfree(h_cgroup);
}

/*
 * Must be called with hugetlb_lock held. Since we hold hugetlb_lock,
 * pages cannot be moved off the active list or uncharged from the cgroup,
 * so there is no need to take a page reference or test whether the page
 * is active here. This function cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	unsigned int nr_pages;
	struct page_counter *counter;
	struct hugetlb_cgroup *page_hcg;
	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

	page_hcg = hugetlb_cgroup_from_page(page);
	/*
	 * We can have pages on the active list that belong to no cgroup,
	 * i.e. hugepages built from fewer than 3 base pages. We can safely
	 * ignore those pages.
	 */
	if (!page_hcg || page_hcg != h_cg)
		goto out;

	nr_pages = 1 << compound_order(page);
	if (!parent) {
		parent = root_h_cgroup;
		/* root has no limit */
		page_counter_charge(&parent->hugepage[idx], nr_pages);
	}
	counter = &h_cg->hugepage[idx];
	/* Take the pages off the local counter */
	page_counter_cancel(counter, nr_pages);

	set_hugetlb_cgroup(page, parent);
out:
	return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct page *page;
	int idx;

	do {
		/* restart from the first hstate on every pass */
		idx = 0;
		for_each_hstate(h) {
			spin_lock(&hugetlb_lock);
			list_for_each_entry(page, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(idx, h_cg, page);

			spin_unlock(&hugetlb_lock);
			idx++;
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}
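
/*
 * Charge path: callers are expected to first reserve the pages against the
 * current task's cgroup with hugetlb_cgroup_charge_cgroup(), which returns
 * the charged cgroup in *ptr, and then attach the charge to the allocated
 * page with hugetlb_cgroup_commit_charge() while holding hugetlb_lock.
 * A charge that is never committed should be dropped again with
 * hugetlb_cgroup_uncharge_cgroup().
 */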
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
	/*
	 * We don't charge any cgroup if the compound page has fewer
	 * than 3 pages.
	 */
	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		goto done;
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	if (!css_tryget_online(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter);
	css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}

/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	set_hugetlb_cgroup(page, h_cg);
	return;
}

/*
 * Should be called with hugetlb_lock held
 */
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
				  struct page *page)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(page);
	if (unlikely(!h_cg))
		return;
	set_hugetlb_cgroup(page, NULL);
	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
	return;
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		return;

	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
	return;
}

enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
};

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->limit * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	default:
		BUG();
	}
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= (1UL << 30))
		snprintf(buf, size, "%luGB", hsize >> 30);
	else if (hsize >= (1UL << 20))
		snprintf(buf, size, "%luMB", hsize >> 20);
	else
		snprintf(buf, size, "%luKB", hsize >> 10);
	return buf;
}
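
/*
 * Register the control files for one hstate: limit_in_bytes, usage_in_bytes,
 * max_usage_in_bytes and failcnt, each prefixed with the human-readable huge
 * page size produced by mem_fmt(), e.g. "2MB.limit_in_bytes" for a 2MB hstate.
 */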
static void __init __hugetlb_cgroup_file_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, 32, huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write;

	/* Add the usage file */
	cft = &h->cgroup_files[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX usage file */
	cft = &h->cgroup_files[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the failcnt file */
	cft = &h->cgroup_files[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files[4];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
					  h->cgroup_files));
}

void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/*
		 * Add cgroup control files only if the huge page consists
		 * of more than two normal pages. This is because we use
		 * page[2].lru.next for storing cgroup details.
		 */
		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
			__hugetlb_cgroup_file_init(hstate_index(h));
	}
}

/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
	struct hugetlb_cgroup *h_cg;
	struct hstate *h = page_hstate(oldhpage);

	if (hugetlb_cgroup_disabled())
		return;

	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
	spin_lock(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(oldhpage);
	set_hugetlb_cgroup(oldhpage, NULL);

	/* Move the h_cg details to the new page */
	set_hugetlb_cgroup(newhpage, h_cg);
	list_move(&newhpage->lru, &h->hugepage_activelist);
	spin_unlock(&hugetlb_lock);
	return;
}

struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc	= hugetlb_cgroup_css_alloc,
	.css_offline	= hugetlb_cgroup_css_offline,
	.css_free	= hugetlb_cgroup_css_free,
};