#include <linux/ceph/ceph_debug.h>

#include <linux/module.h>
#include <linux/slab.h>
#include <asm/div64.h>

#include <linux/ceph/libceph.h>
#include <linux/ceph/osdmap.h>
#include <linux/ceph/decode.h>
#include <linux/crush/hash.h>
#include <linux/crush/mapper.h>

char *ceph_osdmap_state_str(char *str, int len, int state)
{
	if (!len)
		return str;

	if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
		snprintf(str, len, "exists, up");
	else if (state & CEPH_OSD_EXISTS)
		snprintf(str, len, "exists");
	else if (state & CEPH_OSD_UP)
		snprintf(str, len, "up");
	else
		snprintf(str, len, "doesn't exist");

	return str;
}

/* maps */

static int calc_bits_of(unsigned int t)
{
	int b = 0;
	while (t) {
		t = t >> 1;
		b++;
	}
	return b;
}

/*
 * the foo_mask is the smallest value 2^n-1 that is >= foo.
 */
static void calc_pg_masks(struct ceph_pg_pool_info *pi)
{
	pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
	pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
}

/*
 * decode crush map
 */
static int crush_decode_uniform_bucket(void **p, void *end,
				       struct crush_bucket_uniform *b)
{
	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
	b->item_weight = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}

static int crush_decode_list_bucket(void **p, void *end,
				    struct crush_bucket_list *b)
{
	int j;
	dout("crush_decode_list_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->sum_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->sum_weights[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}

static int crush_decode_tree_bucket(void **p, void *end,
				    struct crush_bucket_tree *b)
{
	int j;
	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
	ceph_decode_8_safe(p, end, b->num_nodes, bad);
	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
	if (b->node_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
	for (j = 0; j < b->num_nodes; j++)
		b->node_weights[j] = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}

static int crush_decode_straw_bucket(void **p, void *end,
				     struct crush_bucket_straw *b)
{
	int j;
	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->straws == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->straws[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}
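/*
 * Editorial note: unlike the original straw algorithm above, a straw2
 * bucket carries only per-item weights on the wire -- the straw
 * lengths are computed at mapping time, so there is no precomputed
 * straws[] array to decode below.
 */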
static int crush_decode_straw2_bucket(void **p, void *end,
				      struct crush_bucket_straw2 *b)
{
	int j;
	dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++)
		b->item_weights[j] = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}

static int skip_name_map(void **p, void *end)
{
	int len;
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		int strlen;
		*p += sizeof(u32);
		ceph_decode_32_safe(p, end, strlen, bad);
		*p += strlen;
	}
	return 0;
bad:
	return -EINVAL;
}
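/*
 * Editorial summary of the wire layout decoded below: magic, then
 * max_buckets/max_rules/max_devices, the bucket array, the rule
 * array, three name maps (type, bucket and rule names, which the
 * kernel skips), and finally the optional tunables that newer
 * servers append.
 */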
static struct crush_map *crush_decode(void *pbyval, void *end)
{
	struct crush_map *c;
	int err = -EINVAL;
	int i, j;
	void **p = &pbyval;
	void *start = pbyval;
	u32 magic;
	u32 num_name_maps;

	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));

	c = kzalloc(sizeof(*c), GFP_NOFS);
	if (c == NULL)
		return ERR_PTR(-ENOMEM);

	/* set tunables to default values */
	c->choose_local_tries = 2;
	c->choose_local_fallback_tries = 5;
	c->choose_total_tries = 19;
	c->chooseleaf_descend_once = 0;

	ceph_decode_need(p, end, 4*sizeof(u32), bad);
	magic = ceph_decode_32(p);
	if (magic != CRUSH_MAGIC) {
		pr_err("crush_decode magic %x != current %x\n",
		       (unsigned int)magic, (unsigned int)CRUSH_MAGIC);
		goto bad;
	}
	c->max_buckets = ceph_decode_32(p);
	c->max_rules = ceph_decode_32(p);
	c->max_devices = ceph_decode_32(p);

	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
	if (c->buckets == NULL)
		goto badmem;
	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
	if (c->rules == NULL)
		goto badmem;

	/* buckets */
	for (i = 0; i < c->max_buckets; i++) {
		int size = 0;
		u32 alg;
		struct crush_bucket *b;

		ceph_decode_32_safe(p, end, alg, bad);
		if (alg == 0) {
			c->buckets[i] = NULL;
			continue;
		}
		dout("crush_decode bucket %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		switch (alg) {
		case CRUSH_BUCKET_UNIFORM:
			size = sizeof(struct crush_bucket_uniform);
			break;
		case CRUSH_BUCKET_LIST:
			size = sizeof(struct crush_bucket_list);
			break;
		case CRUSH_BUCKET_TREE:
			size = sizeof(struct crush_bucket_tree);
			break;
		case CRUSH_BUCKET_STRAW:
			size = sizeof(struct crush_bucket_straw);
			break;
		case CRUSH_BUCKET_STRAW2:
			size = sizeof(struct crush_bucket_straw2);
			break;
		default:
			err = -EINVAL;
			goto bad;
		}
		BUG_ON(size == 0);
		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
		if (b == NULL)
			goto badmem;

		ceph_decode_need(p, end, 4*sizeof(u32), bad);
		b->id = ceph_decode_32(p);
		b->type = ceph_decode_16(p);
		b->alg = ceph_decode_8(p);
		b->hash = ceph_decode_8(p);
		b->weight = ceph_decode_32(p);
		b->size = ceph_decode_32(p);

		dout("crush_decode bucket size %d off %x %p to %p\n",
		     b->size, (int)(*p-start), *p, end);

		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
		if (b->items == NULL)
			goto badmem;
		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
		if (b->perm == NULL)
			goto badmem;
		b->perm_n = 0;

		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
		for (j = 0; j < b->size; j++)
			b->items[j] = ceph_decode_32(p);

		switch (b->alg) {
		case CRUSH_BUCKET_UNIFORM:
			err = crush_decode_uniform_bucket(p, end,
				  (struct crush_bucket_uniform *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_LIST:
			err = crush_decode_list_bucket(p, end,
			       (struct crush_bucket_list *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_TREE:
			err = crush_decode_tree_bucket(p, end,
				(struct crush_bucket_tree *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_STRAW:
			err = crush_decode_straw_bucket(p, end,
				(struct crush_bucket_straw *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_STRAW2:
			err = crush_decode_straw2_bucket(p, end,
				(struct crush_bucket_straw2 *)b);
			if (err < 0)
				goto bad;
			break;
		}
	}

	/* rules */
	dout("rule vec is %p\n", c->rules);
	for (i = 0; i < c->max_rules; i++) {
		u32 yes;
		struct crush_rule *r;

		ceph_decode_32_safe(p, end, yes, bad);
		if (!yes) {
			dout("crush_decode NO rule %d off %x %p to %p\n",
			     i, (int)(*p-start), *p, end);
			c->rules[i] = NULL;
			continue;
		}

		dout("crush_decode rule %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		/* len */
		ceph_decode_32_safe(p, end, yes, bad);
#if BITS_PER_LONG == 32
		err = -EINVAL;
		if (yes > (ULONG_MAX - sizeof(*r))
			  / sizeof(struct crush_rule_step))
			goto bad;
#endif
		r = c->rules[i] = kmalloc(sizeof(*r) +
					  yes*sizeof(struct crush_rule_step),
					  GFP_NOFS);
		if (r == NULL)
			goto badmem;
		dout(" rule %d is at %p\n", i, r);
		r->len = yes;
		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
		for (j = 0; j < r->len; j++) {
			r->steps[j].op = ceph_decode_32(p);
			r->steps[j].arg1 = ceph_decode_32(p);
			r->steps[j].arg2 = ceph_decode_32(p);
		}
	}

	/* ignore trailing name maps. */
	for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
		err = skip_name_map(p, end);
		if (err < 0)
			goto done;
	}

	/* tunables */
	ceph_decode_need(p, end, 3*sizeof(u32), done);
	c->choose_local_tries = ceph_decode_32(p);
	c->choose_local_fallback_tries = ceph_decode_32(p);
	c->choose_total_tries = ceph_decode_32(p);
	dout("crush decode tunable choose_local_tries = %d\n",
	     c->choose_local_tries);
	dout("crush decode tunable choose_local_fallback_tries = %d\n",
	     c->choose_local_fallback_tries);
	dout("crush decode tunable choose_total_tries = %d\n",
	     c->choose_total_tries);

	ceph_decode_need(p, end, sizeof(u32), done);
	c->chooseleaf_descend_once = ceph_decode_32(p);
	dout("crush decode tunable chooseleaf_descend_once = %d\n",
	     c->chooseleaf_descend_once);

	ceph_decode_need(p, end, sizeof(u8), done);
	c->chooseleaf_vary_r = ceph_decode_8(p);
	dout("crush decode tunable chooseleaf_vary_r = %d\n",
	     c->chooseleaf_vary_r);

done:
	dout("crush_decode success\n");
	return c;

badmem:
	err = -ENOMEM;
bad:
	dout("crush_decode fail %d\n", err);
	crush_destroy(c);
	return ERR_PTR(err);
}

/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds) and primary_temp (explicit primary setting)
 */
static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
{
	if (l.pool < r.pool)
		return -1;
	if (l.pool > r.pool)
		return 1;
	if (l.seed < r.seed)
		return -1;
	if (l.seed > r.seed)
		return 1;
	return 0;
}

static int __insert_pg_mapping(struct ceph_pg_mapping *new,
			       struct rb_root *root)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ceph_pg_mapping *pg = NULL;
	int c;

	dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new);
	while (*p) {
		parent = *p;
		pg = rb_entry(parent, struct ceph_pg_mapping, node);
		c = pgid_cmp(new->pgid, pg->pgid);
		if (c < 0)
			p = &(*p)->rb_left;
		else if (c > 0)
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
	return 0;
}
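/*
 * Illustrative use of the pg_mapping helpers above and below
 * (hypothetical caller, not part of this file):
 *
 *	struct ceph_pg_mapping *pg = kzalloc(sizeof(*pg), GFP_NOFS);
 *	pg->pgid = pgid;
 *	if (__insert_pg_mapping(pg, &map->pg_temp))   (-EEXIST on dup)
 *		kfree(pg);
 *	...
 *	pg = __lookup_pg_mapping(&map->pg_temp, pgid);
 */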
static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
						   struct ceph_pg pgid)
{
	struct rb_node *n = root->rb_node;
	struct ceph_pg_mapping *pg;
	int c;

	while (n) {
		pg = rb_entry(n, struct ceph_pg_mapping, node);
		c = pgid_cmp(pgid, pg->pgid);
		if (c < 0) {
			n = n->rb_left;
		} else if (c > 0) {
			n = n->rb_right;
		} else {
			dout("__lookup_pg_mapping %lld.%x got %p\n",
			     pgid.pool, pgid.seed, pg);
			return pg;
		}
	}
	return NULL;
}

static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
{
	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);

	if (pg) {
		dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
		     pg);
		rb_erase(&pg->node, root);
		kfree(pg);
		return 0;
	}
	dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
	return -ENOENT;
}

/*
 * rbtree of pg pool info
 */
static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ceph_pg_pool_info *pi = NULL;

	while (*p) {
		parent = *p;
		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
		if (new->id < pi->id)
			p = &(*p)->rb_left;
		else if (new->id > pi->id)
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
	return 0;
}

static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
{
	struct ceph_pg_pool_info *pi;
	struct rb_node *n = root->rb_node;

	while (n) {
		pi = rb_entry(n, struct ceph_pg_pool_info, node);
		if (id < pi->id)
			n = n->rb_left;
		else if (id > pi->id)
			n = n->rb_right;
		else
			return pi;
	}
	return NULL;
}

struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
{
	return __lookup_pg_pool(&map->pg_pools, id);
}

const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
{
	struct ceph_pg_pool_info *pi;

	if (id == CEPH_NOPOOL)
		return NULL;

	if (WARN_ON_ONCE(id > (u64) INT_MAX))
		return NULL;

	pi = __lookup_pg_pool(&map->pg_pools, (int) id);

	return pi ? pi->name : NULL;
}
EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
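/*
 * Illustrative pairing of the two lookups (hypothetical caller):
 *
 *	int id = ceph_pg_poolid_by_name(map, "rbd");
 *	const char *name = id >= 0 ?
 *		ceph_pg_pool_name_by_id(map, id) : NULL;
 *
 * Lookup by id is O(log n) via the rbtree; lookup by name below is a
 * linear scan, so it is meant for infrequent callers (e.g. mount).
 */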
int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
{
	struct rb_node *rbp;

	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
		struct ceph_pg_pool_info *pi =
			rb_entry(rbp, struct ceph_pg_pool_info, node);
		if (pi->name && strcmp(pi->name, name) == 0)
			return pi->id;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(ceph_pg_poolid_by_name);

static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
	rb_erase(&pi->node, root);
	kfree(pi->name);
	kfree(pi);
}

static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
{
	u8 ev, cv;
	unsigned len, num;
	void *pool_end;

	ceph_decode_need(p, end, 2 + 4, bad);
	ev = ceph_decode_8(p);  /* encoding version */
	cv = ceph_decode_8(p);  /* compat version */
	if (ev < 5) {
		pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
		return -EINVAL;
	}
	if (cv > 9) {
		pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
		return -EINVAL;
	}
	len = ceph_decode_32(p);
	ceph_decode_need(p, end, len, bad);
	pool_end = *p + len;

	pi->type = ceph_decode_8(p);
	pi->size = ceph_decode_8(p);
	pi->crush_ruleset = ceph_decode_8(p);
	pi->object_hash = ceph_decode_8(p);

	pi->pg_num = ceph_decode_32(p);
	pi->pgp_num = ceph_decode_32(p);

	*p += 4 + 4;  /* skip lpg* */
	*p += 4;      /* skip last_change */
	*p += 8 + 4;  /* skip snap_seq, snap_epoch */

	/* skip snaps */
	num = ceph_decode_32(p);
	while (num--) {
		*p += 8;      /* snapid key */
		*p += 1 + 1;  /* versions */
		len = ceph_decode_32(p);
		*p += len;
	}

	/* skip removed_snaps */
	num = ceph_decode_32(p);
	*p += num * (8 + 8);

	*p += 8;  /* skip auid */
	pi->flags = ceph_decode_64(p);
	*p += 4;  /* skip crash_replay_interval */

	if (ev >= 7)
		*p += 1;  /* skip min_size */

	if (ev >= 8)
		*p += 8 + 8;  /* skip quota_max_* */

	if (ev >= 9) {
		/* skip tiers */
		num = ceph_decode_32(p);
		*p += num * 8;

		*p += 8;  /* skip tier_of */
		*p += 1;  /* skip cache_mode */

		pi->read_tier = ceph_decode_64(p);
		pi->write_tier = ceph_decode_64(p);
	} else {
		pi->read_tier = -1;
		pi->write_tier = -1;
	}

	/* ignore the rest */

	*p = pool_end;
	calc_pg_masks(pi);
	return 0;

bad:
	return -EINVAL;
}

static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
{
	struct ceph_pg_pool_info *pi;
	u32 num, len;
	u64 pool;

	ceph_decode_32_safe(p, end, num, bad);
	dout(" %d pool names\n", num);
	while (num--) {
		ceph_decode_64_safe(p, end, pool, bad);
		ceph_decode_32_safe(p, end, len, bad);
		dout(" pool %llu len %d\n", pool, len);
		ceph_decode_need(p, end, len, bad);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (pi) {
			char *name = kstrndup(*p, len, GFP_NOFS);

			if (!name)
				return -ENOMEM;
			kfree(pi->name);
			pi->name = name;
			dout(" name is %s\n", pi->name);
		}
		*p += len;
	}
	return 0;

bad:
	return -EINVAL;
}
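/*
 * Editorial note: decode_pool() trusts the length header and jumps to
 * pool_end when done, so fields appended by encodings newer than v9
 * are skipped rather than rejected; only a compat version above 9
 * fails the decode.
 */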
/*
 * osd map
 */
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
	dout("osdmap_destroy %p\n", map);
	if (map->crush)
		crush_destroy(map->crush);
	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
		struct ceph_pg_mapping *pg =
			rb_entry(rb_first(&map->pg_temp),
				 struct ceph_pg_mapping, node);
		rb_erase(&pg->node, &map->pg_temp);
		kfree(pg);
	}
	while (!RB_EMPTY_ROOT(&map->primary_temp)) {
		struct ceph_pg_mapping *pg =
			rb_entry(rb_first(&map->primary_temp),
				 struct ceph_pg_mapping, node);
		rb_erase(&pg->node, &map->primary_temp);
		kfree(pg);
	}
	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
		struct ceph_pg_pool_info *pi =
			rb_entry(rb_first(&map->pg_pools),
				 struct ceph_pg_pool_info, node);
		__remove_pg_pool(&map->pg_pools, pi);
	}
	kfree(map->osd_state);
	kfree(map->osd_weight);
	kfree(map->osd_addr);
	kfree(map->osd_primary_affinity);
	kfree(map);
}

/*
 * Adjust max_osd value, (re)allocate arrays.
 *
 * The new elements are properly initialized.
 */
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{
	u8 *state;
	u32 *weight;
	struct ceph_entity_addr *addr;
	int i;

	state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
	if (!state)
		return -ENOMEM;
	map->osd_state = state;

	weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
	if (!weight)
		return -ENOMEM;
	map->osd_weight = weight;

	addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
	if (!addr)
		return -ENOMEM;
	map->osd_addr = addr;

	for (i = map->max_osd; i < max; i++) {
		map->osd_state[i] = 0;
		map->osd_weight[i] = CEPH_OSD_OUT;
		memset(map->osd_addr + i, 0, sizeof(*map->osd_addr));
	}

	if (map->osd_primary_affinity) {
		u32 *affinity;

		affinity = krealloc(map->osd_primary_affinity,
				    max*sizeof(*affinity), GFP_NOFS);
		if (!affinity)
			return -ENOMEM;
		map->osd_primary_affinity = affinity;

		for (i = map->max_osd; i < max; i++)
			map->osd_primary_affinity[i] =
			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
	}

	map->max_osd = max;

	return 0;
}

#define OSDMAP_WRAPPER_COMPAT_VER	7
#define OSDMAP_CLIENT_DATA_COMPAT_VER	1
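/*
 * Editorial sketch of the two encodings handled below:
 *
 *   v7+ ("new"):  u8 wrapper struct_v, u8 wrapper struct_compat,
 *                 u32 wrapper struct_len, u8 client struct_v,
 *                 u8 client struct_compat, u32 client struct_len, ...
 *   v6  ("old"):  u16 version, ...
 *
 * The first byte disambiguates: >= 7 means the wrapper is present,
 * otherwise that byte is re-read (together with the next one) as the
 * old u16 version.
 */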
/*
 * Return 0 or error.  On success, *v is set to 0 for old (v6) osdmaps,
 * to struct_v of the client_data section for new (v7 and above)
 * osdmaps.
 */
static int get_osdmap_client_data_v(void **p, void *end,
				    const char *prefix, u8 *v)
{
	u8 struct_v;

	ceph_decode_8_safe(p, end, struct_v, e_inval);
	if (struct_v >= 7) {
		u8 struct_compat;

		ceph_decode_8_safe(p, end, struct_compat, e_inval);
		if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
			pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n",
				struct_v, struct_compat,
				OSDMAP_WRAPPER_COMPAT_VER, prefix);
			return -EINVAL;
		}
		*p += 4;  /* ignore wrapper struct_len */

		ceph_decode_8_safe(p, end, struct_v, e_inval);
		ceph_decode_8_safe(p, end, struct_compat, e_inval);
		if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
			pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n",
				struct_v, struct_compat,
				OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
			return -EINVAL;
		}
		*p += 4;  /* ignore client data struct_len */
	} else {
		u16 version;

		*p -= 1;
		ceph_decode_16_safe(p, end, version, e_inval);
		if (version < 6) {
			pr_warn("got v %d < 6 of %s ceph_osdmap\n",
				version, prefix);
			return -EINVAL;
		}

		/* old osdmap encoding */
		struct_v = 0;
	}

	*v = struct_v;
	return 0;

e_inval:
	return -EINVAL;
}

static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
			  bool incremental)
{
	u32 n;

	ceph_decode_32_safe(p, end, n, e_inval);
	while (n--) {
		struct ceph_pg_pool_info *pi;
		u64 pool;
		int ret;

		ceph_decode_64_safe(p, end, pool, e_inval);

		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (!incremental || !pi) {
			pi = kzalloc(sizeof(*pi), GFP_NOFS);
			if (!pi)
				return -ENOMEM;

			pi->id = pool;

			ret = __insert_pg_pool(&map->pg_pools, pi);
			if (ret) {
				kfree(pi);
				return ret;
			}
		}

		ret = decode_pool(p, end, pi);
		if (ret)
			return ret;
	}

	return 0;

e_inval:
	return -EINVAL;
}

static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
{
	return __decode_pools(p, end, map, false);
}

static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
{
	return __decode_pools(p, end, map, true);
}

static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
			    bool incremental)
{
	u32 n;

	ceph_decode_32_safe(p, end, n, e_inval);
	while (n--) {
		struct ceph_pg pgid;
		u32 len, i;
		int ret;

		ret = ceph_decode_pgid(p, end, &pgid);
		if (ret)
			return ret;

		ceph_decode_32_safe(p, end, len, e_inval);

		ret = __remove_pg_mapping(&map->pg_temp, pgid);
		BUG_ON(!incremental && ret != -ENOENT);

		if (!incremental || len > 0) {
			struct ceph_pg_mapping *pg;

			ceph_decode_need(p, end, len*sizeof(u32), e_inval);

			if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
				return -EINVAL;

			pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
			if (!pg)
				return -ENOMEM;

			pg->pgid = pgid;
			pg->pg_temp.len = len;
			for (i = 0; i < len; i++)
				pg->pg_temp.osds[i] = ceph_decode_32(p);

			ret = __insert_pg_mapping(pg, &map->pg_temp);
			if (ret) {
				kfree(pg);
				return ret;
			}
		}
	}

	return 0;

e_inval:
	return -EINVAL;
}

static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{
	return __decode_pg_temp(p, end, map, false);
}

static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{
	return __decode_pg_temp(p, end, map, true);
}
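/*
 * Editorial note on the incremental variants above and below: an
 * entry with len == 0 (pg_temp) or osd == (u32)-1 (primary_temp) is a
 * removal -- the old mapping is erased and nothing is re-inserted.
 */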
static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
				 bool incremental)
{
	u32 n;

	ceph_decode_32_safe(p, end, n, e_inval);
	while (n--) {
		struct ceph_pg pgid;
		u32 osd;
		int ret;

		ret = ceph_decode_pgid(p, end, &pgid);
		if (ret)
			return ret;

		ceph_decode_32_safe(p, end, osd, e_inval);

		ret = __remove_pg_mapping(&map->primary_temp, pgid);
		BUG_ON(!incremental && ret != -ENOENT);

		if (!incremental || osd != (u32)-1) {
			struct ceph_pg_mapping *pg;

			pg = kzalloc(sizeof(*pg), GFP_NOFS);
			if (!pg)
				return -ENOMEM;

			pg->pgid = pgid;
			pg->primary_temp.osd = osd;

			ret = __insert_pg_mapping(pg, &map->primary_temp);
			if (ret) {
				kfree(pg);
				return ret;
			}
		}
	}

	return 0;

e_inval:
	return -EINVAL;
}

static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
{
	return __decode_primary_temp(p, end, map, false);
}

static int decode_new_primary_temp(void **p, void *end,
				   struct ceph_osdmap *map)
{
	return __decode_primary_temp(p, end, map, true);
}

u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
{
	BUG_ON(osd >= map->max_osd);

	if (!map->osd_primary_affinity)
		return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;

	return map->osd_primary_affinity[osd];
}

static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
{
	BUG_ON(osd >= map->max_osd);

	if (!map->osd_primary_affinity) {
		int i;

		map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32),
						    GFP_NOFS);
		if (!map->osd_primary_affinity)
			return -ENOMEM;

		for (i = 0; i < map->max_osd; i++)
			map->osd_primary_affinity[i] =
			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
	}

	map->osd_primary_affinity[osd] = aff;

	return 0;
}

static int decode_primary_affinity(void **p, void *end,
				   struct ceph_osdmap *map)
{
	u32 len, i;

	ceph_decode_32_safe(p, end, len, e_inval);
	if (len == 0) {
		kfree(map->osd_primary_affinity);
		map->osd_primary_affinity = NULL;
		return 0;
	}
	if (len != map->max_osd)
		goto e_inval;

	ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);

	for (i = 0; i < map->max_osd; i++) {
		int ret;

		ret = set_primary_affinity(map, i, ceph_decode_32(p));
		if (ret)
			return ret;
	}

	return 0;

e_inval:
	return -EINVAL;
}

static int decode_new_primary_affinity(void **p, void *end,
				       struct ceph_osdmap *map)
{
	u32 n;

	ceph_decode_32_safe(p, end, n, e_inval);
	while (n--) {
		u32 osd, aff;
		int ret;

		ceph_decode_32_safe(p, end, osd, e_inval);
		ceph_decode_32_safe(p, end, aff, e_inval);

		ret = set_primary_affinity(map, osd, aff);
		if (ret)
			return ret;

		pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
	}

	return 0;

e_inval:
	return -EINVAL;
}
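/*
 * Editorial summary of the full map layout decoded below: fsid,
 * epoch, created, modified; pools and pool names; pool_max and flags;
 * max_osd followed by the osd_state, osd_weight and osd_addr arrays;
 * pg_temp; then, for client data v1+/v2+, primary_temp and
 * primary_affinity; and finally the embedded crush map blob.
 */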
/*
 * decode a full map.
 */
static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
{
	u8 struct_v;
	u32 epoch = 0;
	void *start = *p;
	u32 max;
	u32 len, i;
	int err;

	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));

	err = get_osdmap_client_data_v(p, end, "full", &struct_v);
	if (err)
		goto bad;

	/* fsid, epoch, created, modified */
	ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
			 sizeof(map->created) + sizeof(map->modified), e_inval);
	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
	epoch = map->epoch = ceph_decode_32(p);
	ceph_decode_copy(p, &map->created, sizeof(map->created));
	ceph_decode_copy(p, &map->modified, sizeof(map->modified));

	/* pools */
	err = decode_pools(p, end, map);
	if (err)
		goto bad;

	/* pool_name */
	err = decode_pool_names(p, end, map);
	if (err)
		goto bad;

	ceph_decode_32_safe(p, end, map->pool_max, e_inval);

	ceph_decode_32_safe(p, end, map->flags, e_inval);

	/* max_osd */
	ceph_decode_32_safe(p, end, max, e_inval);

	/* (re)alloc osd arrays */
	err = osdmap_set_max_osd(map, max);
	if (err)
		goto bad;

	/* osd_state, osd_weight, osd_addrs->client_addr */
	ceph_decode_need(p, end, 3*sizeof(u32) +
			 map->max_osd*(1 + sizeof(*map->osd_weight) +
				       sizeof(*map->osd_addr)), e_inval);

	if (ceph_decode_32(p) != map->max_osd)
		goto e_inval;

	ceph_decode_copy(p, map->osd_state, map->max_osd);

	if (ceph_decode_32(p) != map->max_osd)
		goto e_inval;

	for (i = 0; i < map->max_osd; i++)
		map->osd_weight[i] = ceph_decode_32(p);

	if (ceph_decode_32(p) != map->max_osd)
		goto e_inval;

	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
	for (i = 0; i < map->max_osd; i++)
		ceph_decode_addr(&map->osd_addr[i]);

	/* pg_temp */
	err = decode_pg_temp(p, end, map);
	if (err)
		goto bad;

	/* primary_temp */
	if (struct_v >= 1) {
		err = decode_primary_temp(p, end, map);
		if (err)
			goto bad;
	}

	/* primary_affinity */
	if (struct_v >= 2) {
		err = decode_primary_affinity(p, end, map);
		if (err)
			goto bad;
	} else {
		/* XXX can this happen? */
		kfree(map->osd_primary_affinity);
		map->osd_primary_affinity = NULL;
	}

	/* crush */
	ceph_decode_32_safe(p, end, len, e_inval);
	map->crush = crush_decode(*p, min(*p + len, end));
	if (IS_ERR(map->crush)) {
		err = PTR_ERR(map->crush);
		map->crush = NULL;
		goto bad;
	}
	*p += len;

	/* ignore the rest */
	*p = end;

	dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
	return 0;

e_inval:
	err = -EINVAL;
bad:
	pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
	       err, epoch, (int)(*p - start), *p, start, end);
	print_hex_dump(KERN_DEBUG, "osdmap: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       start, end - start, true);
	return err;
}
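/*
 * Illustrative caller of the decode entry point below (hypothetical,
 * simplified):
 *
 *	struct ceph_osdmap *map = ceph_osdmap_decode(&p, p + maplen);
 *	if (IS_ERR(map))
 *		return PTR_ERR(map);
 *	... use map ...
 *	ceph_osdmap_destroy(map);
 */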
/*
 * Allocate and decode a full map.
 */
struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
{
	struct ceph_osdmap *map;
	int ret;

	map = kzalloc(sizeof(*map), GFP_NOFS);
	if (!map)
		return ERR_PTR(-ENOMEM);

	map->pg_temp = RB_ROOT;
	map->primary_temp = RB_ROOT;
	mutex_init(&map->crush_scratch_mutex);

	ret = osdmap_decode(p, end, map);
	if (ret) {
		ceph_osdmap_destroy(map);
		return ERR_PTR(ret);
	}

	return map;
}
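/*
 * Editorial note: an incremental update may simply embed a full map
 * (non-zero "full map" length below), in which case a brand new
 * ceph_osdmap is decoded and returned, leaving the old map for the
 * caller to destroy.
 */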
/*
 * decode and apply an incremental map update.
 */
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
					     struct ceph_osdmap *map,
					     struct ceph_messenger *msgr)
{
	struct crush_map *newcrush = NULL;
	struct ceph_fsid fsid;
	u32 epoch = 0;
	struct ceph_timespec modified;
	s32 len;
	u64 pool;
	__s64 new_pool_max;
	__s32 new_flags, max;
	void *start = *p;
	int err;
	u8 struct_v;

	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));

	err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
	if (err)
		goto bad;

	/* fsid, epoch, modified, new_pool_max, new_flags */
	ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
			 sizeof(u64) + sizeof(u32), e_inval);
	ceph_decode_copy(p, &fsid, sizeof(fsid));
	epoch = ceph_decode_32(p);
	BUG_ON(epoch != map->epoch+1);
	ceph_decode_copy(p, &modified, sizeof(modified));
	new_pool_max = ceph_decode_64(p);
	new_flags = ceph_decode_32(p);

	/* full map? */
	ceph_decode_32_safe(p, end, len, e_inval);
	if (len > 0) {
		dout("apply_incremental full map len %d, %p to %p\n",
		     len, *p, end);
		return ceph_osdmap_decode(p, min(*p+len, end));
	}

	/* new crush? */
	ceph_decode_32_safe(p, end, len, e_inval);
	if (len > 0) {
		newcrush = crush_decode(*p, min(*p+len, end));
		if (IS_ERR(newcrush)) {
			err = PTR_ERR(newcrush);
			newcrush = NULL;
			goto bad;
		}
		*p += len;
	}

	/* new flags? */
	if (new_flags >= 0)
		map->flags = new_flags;
	if (new_pool_max >= 0)
		map->pool_max = new_pool_max;

	/* new max? */
	ceph_decode_32_safe(p, end, max, e_inval);
	if (max >= 0) {
		err = osdmap_set_max_osd(map, max);
		if (err)
			goto bad;
	}

	map->epoch++;
	map->modified = modified;
	if (newcrush) {
		if (map->crush)
			crush_destroy(map->crush);
		map->crush = newcrush;
		newcrush = NULL;
	}

	/* new_pools */
	err = decode_new_pools(p, end, map);
	if (err)
		goto bad;

	/* new_pool_names */
	err = decode_pool_names(p, end, map);
	if (err)
		goto bad;

	/* old_pool */
	ceph_decode_32_safe(p, end, len, e_inval);
	while (len--) {
		struct ceph_pg_pool_info *pi;

		ceph_decode_64_safe(p, end, pool, e_inval);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (pi)
			__remove_pg_pool(&map->pg_pools, pi);
	}

	/* new_up */
	ceph_decode_32_safe(p, end, len, e_inval);
	while (len--) {
		u32 osd;
		struct ceph_entity_addr addr;
		ceph_decode_32_safe(p, end, osd, e_inval);
		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
		ceph_decode_addr(&addr);
		pr_info("osd%d up\n", osd);
		BUG_ON(osd >= map->max_osd);
		map->osd_state[osd] |= CEPH_OSD_UP;
		map->osd_addr[osd] = addr;
	}

	/* new_state */
	ceph_decode_32_safe(p, end, len, e_inval);
	while (len--) {
		u32 osd;
		u8 xorstate;
		ceph_decode_32_safe(p, end, osd, e_inval);
		xorstate = **(u8 **)p;
		(*p)++;  /* clean flag */
		if (xorstate == 0)
			xorstate = CEPH_OSD_UP;
		if (xorstate & CEPH_OSD_UP)
			pr_info("osd%d down\n", osd);
		if (osd < map->max_osd)
			map->osd_state[osd] ^= xorstate;
	}

	/* new_weight */
	ceph_decode_32_safe(p, end, len, e_inval);
	while (len--) {
		u32 osd, off;
		ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
		osd = ceph_decode_32(p);
		off = ceph_decode_32(p);
		pr_info("osd%d weight 0x%x %s\n", osd, off,
			off == CEPH_OSD_IN ? "(in)" :
			(off == CEPH_OSD_OUT ? "(out)" : ""));
		if (osd < map->max_osd)
			map->osd_weight[osd] = off;
	}

	/* new_pg_temp */
	err = decode_new_pg_temp(p, end, map);
	if (err)
		goto bad;

	/* new_primary_temp */
	if (struct_v >= 1) {
		err = decode_new_primary_temp(p, end, map);
		if (err)
			goto bad;
	}

	/* new_primary_affinity */
	if (struct_v >= 2) {
		err = decode_new_primary_affinity(p, end, map);
		if (err)
			goto bad;
	}

	/* ignore the rest */
	*p = end;

	dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
	return map;

e_inval:
	err = -EINVAL;
bad:
	pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
	       err, epoch, (int)(*p - start), *p, start, end);
	print_hex_dump(KERN_DEBUG, "osdmap: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       start, end - start, true);
	if (newcrush)
		crush_destroy(newcrush);
	return ERR_PTR(err);
}

/*
 * calculate file layout from given offset, length.
 * fill in correct oid, logical length, and object extent
 * offset, length.
 *
 * for now, we write only a single su, until we can
 * pass a stride back to the caller.
 */
int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
				  u64 off, u64 len,
				  u64 *ono,
				  u64 *oxoff, u64 *oxlen)
{
	u32 osize = le32_to_cpu(layout->fl_object_size);
	u32 su = le32_to_cpu(layout->fl_stripe_unit);
	u32 sc = le32_to_cpu(layout->fl_stripe_count);
	u32 bl, stripeno, stripepos, objsetno;
	u32 su_per_object;
	u64 t, su_offset;

	dout("mapping %llu~%llu osize %u fl_su %u\n", off, len,
	     osize, su);
	if (su == 0 || sc == 0)
		goto invalid;
	su_per_object = osize / su;
	if (su_per_object == 0)
		goto invalid;
	dout("osize %u / su %u = su_per_object %u\n", osize, su,
	     su_per_object);

	if ((su & ~PAGE_MASK) != 0)
		goto invalid;

	/* bl = *off / su; */
	t = off;
	do_div(t, su);
	bl = t;
	dout("off %llu / su %u = bl %u\n", off, su, bl);

	stripeno = bl / sc;
	stripepos = bl % sc;
	objsetno = stripeno / su_per_object;

	*ono = objsetno * sc + stripepos;
	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono);

	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
	t = off;
	su_offset = do_div(t, su);
	*oxoff = su_offset + (stripeno % su_per_object) * su;

	/*
	 * Calculate the length of the extent being written to the selected
	 * object.  This is the minimum of the full length requested (len) or
	 * the remainder of the current stripe being written to.
	 */
	*oxlen = min_t(u64, len, su - su_offset);

	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
	return 0;

invalid:
	dout(" invalid layout\n");
	*ono = 0;
	*oxoff = 0;
	*oxlen = 0;
	return -EINVAL;
}
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
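/*
 * Worked example for the mapping above (illustrative numbers): with
 * su = 1M, sc = 4 and object_size = 4M (so su_per_object = 4), a
 * write at off = 6M gives bl = 6, stripeno = 1, stripepos = 2 and
 * objsetno = 0, hence ono = 2 and oxoff = (1 % 4) * 1M = 1M; for
 * len = 2M, oxlen = min(2M, 1M) = 1M -- only one stripe unit per
 * call, as the comment above notes.
 */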
/*
 * Calculate mapping of a (oloc, oid) pair to a PG.  Should only be
 * called with target's (oloc, oid), since tiering isn't taken into
 * account.
 */
int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
			struct ceph_object_locator *oloc,
			struct ceph_object_id *oid,
			struct ceph_pg *pg_out)
{
	struct ceph_pg_pool_info *pi;

	pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
	if (!pi)
		return -EIO;

	pg_out->pool = oloc->pool;
	pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
				     oid->name_len);

	dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
	     pg_out->pool, pg_out->seed);
	return 0;
}
EXPORT_SYMBOL(ceph_oloc_oid_to_pg);

static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
		    int *result, int result_max,
		    const __u32 *weight, int weight_max)
{
	int r;

	BUG_ON(result_max > CEPH_PG_MAX_SIZE);

	mutex_lock(&map->crush_scratch_mutex);
	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
			  weight, weight_max, map->crush_scratch_ary);
	mutex_unlock(&map->crush_scratch_mutex);

	return r;
}
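/*
 * Editorial overview of the placement pipeline implemented below:
 * pg_to_raw_osds() runs the crush rule to get the raw set,
 * raw_to_up_osds() drops (or masks out) down osds to get the up set,
 * apply_primary_affinity() may re-pick the primary, and apply_temps()
 * overlays pg_temp/primary_temp to yield the acting set that
 * ceph_calc_pg_acting() returns.
 */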
/*
 * Calculate raw (crush) set for given pgid.
 *
 * Return raw set length, or error.
 */
static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
			  struct ceph_pg_pool_info *pool,
			  struct ceph_pg pgid, u32 pps, int *osds)
{
	int ruleno;
	int len;

	/* crush */
	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
				 pool->type, pool->size);
	if (ruleno < 0) {
		pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
		       pgid.pool, pool->crush_ruleset, pool->type,
		       pool->size);
		return -ENOENT;
	}

	len = do_crush(osdmap, ruleno, pps, osds,
		       min_t(int, pool->size, CEPH_PG_MAX_SIZE),
		       osdmap->osd_weight, osdmap->max_osd);
	if (len < 0) {
		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
		       len, ruleno, pgid.pool, pool->crush_ruleset,
		       pool->type, pool->size);
		return len;
	}

	return len;
}

/*
 * Given raw set, calculate up set and up primary.
 *
 * Return up set length.  *primary is set to up primary osd id, or -1
 * if up set is empty.
 */
static int raw_to_up_osds(struct ceph_osdmap *osdmap,
			  struct ceph_pg_pool_info *pool,
			  int *osds, int len, int *primary)
{
	int up_primary = -1;
	int i;

	if (ceph_can_shift_osds(pool)) {
		int removed = 0;

		for (i = 0; i < len; i++) {
			if (ceph_osd_is_down(osdmap, osds[i])) {
				removed++;
				continue;
			}
			if (removed)
				osds[i - removed] = osds[i];
		}

		len -= removed;
		if (len > 0)
			up_primary = osds[0];
	} else {
		for (i = len - 1; i >= 0; i--) {
			if (ceph_osd_is_down(osdmap, osds[i]))
				osds[i] = CRUSH_ITEM_NONE;
			else
				up_primary = osds[i];
		}
	}

	*primary = up_primary;
	return len;
}

static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
				   struct ceph_pg_pool_info *pool,
				   int *osds, int len, int *primary)
{
	int i;
	int pos = -1;

	/*
	 * Do we have any non-default primary_affinity values for these
	 * osds?
	 */
	if (!osdmap->osd_primary_affinity)
		return;

	for (i = 0; i < len; i++) {
		int osd = osds[i];

		if (osd != CRUSH_ITEM_NONE &&
		    osdmap->osd_primary_affinity[osd] !=
					CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
			break;
		}
	}
	if (i == len)
		return;

	/*
	 * Pick the primary.  Feed both the seed (for the pg) and the
	 * osd into the hash/rng so that a proportional fraction of an
	 * osd's pgs get rejected as primary.
	 */
	for (i = 0; i < len; i++) {
		int osd = osds[i];
		u32 aff;

		if (osd == CRUSH_ITEM_NONE)
			continue;

		aff = osdmap->osd_primary_affinity[osd];
		if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
		    (crush_hash32_2(CRUSH_HASH_RJENKINS1,
				    pps, osd) >> 16) >= aff) {
			/*
			 * We chose not to use this primary.  Note it
			 * anyway as a fallback in case we don't pick
			 * anyone else, but keep looking.
			 */
			if (pos < 0)
				pos = i;
		} else {
			pos = i;
			break;
		}
	}
	if (pos < 0)
		return;

	*primary = osds[pos];

	if (ceph_can_shift_osds(pool) && pos > 0) {
		/* move the new primary to the front */
		for (i = pos; i > 0; i--)
			osds[i] = osds[i - 1];
		osds[0] = *primary;
	}
}
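/*
 * Editorial note on the affinity test above: aff is a 16.16-style
 * fixed-point weight (CEPH_OSD_MAX_PRIMARY_AFFINITY, 0x10000, means
 * "always eligible").  The top 16 bits of the hash are uniform over
 * [0, 0xffff], so an osd keeps the primary role at each position
 * with probability roughly aff / 0x10000.
 */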
/*
 * Given up set, apply pg_temp and primary_temp mappings.
 *
 * Return acting set length.  *primary is set to acting primary osd id,
 * or -1 if acting set is empty.
 */
static int apply_temps(struct ceph_osdmap *osdmap,
		       struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
		       int *osds, int len, int *primary)
{
	struct ceph_pg_mapping *pg;
	int temp_len;
	int temp_primary;
	int i;

	/* raw_pg -> pg */
	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
				    pool->pg_num_mask);

	/* pg_temp? */
	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
	if (pg) {
		temp_len = 0;
		temp_primary = -1;

		for (i = 0; i < pg->pg_temp.len; i++) {
			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
				if (ceph_can_shift_osds(pool))
					continue;
				else
					osds[temp_len++] = CRUSH_ITEM_NONE;
			} else {
				osds[temp_len++] = pg->pg_temp.osds[i];
			}
		}

		/* apply pg_temp's primary */
		for (i = 0; i < temp_len; i++) {
			if (osds[i] != CRUSH_ITEM_NONE) {
				temp_primary = osds[i];
				break;
			}
		}
	} else {
		temp_len = len;
		temp_primary = *primary;
	}

	/* primary_temp? */
	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
	if (pg)
		temp_primary = pg->primary_temp.osd;

	*primary = temp_primary;
	return temp_len;
}

/*
 * Calculate acting set for given pgid.
 *
 * Return acting set length, or error.  *primary is set to acting
 * primary osd id, or -1 if acting set is empty or on error.
 */
int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
			int *osds, int *primary)
{
	struct ceph_pg_pool_info *pool;
	u32 pps;
	int len;

	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
	if (!pool) {
		*primary = -1;
		return -ENOENT;
	}

	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
		/* hash pool id and seed so that pool PGs do not overlap */
		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
				     ceph_stable_mod(pgid.seed, pool->pgp_num,
						     pool->pgp_num_mask),
				     pgid.pool);
	} else {
		/*
		 * legacy behavior: add ps and pool together.  this is
		 * not a great approach because the PGs from each pool
		 * will overlap on top of each other: 0.5 == 1.4 ==
		 * 2.3 == ...
		 */
		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
				      pool->pgp_num_mask) +
			(unsigned)pgid.pool;
	}

	len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
	if (len < 0) {
		*primary = -1;
		return len;
	}

	len = raw_to_up_osds(osdmap, pool, osds, len, primary);

	apply_primary_affinity(osdmap, pps, pool, osds, len, primary);

	len = apply_temps(osdmap, pool, pgid, osds, len, primary);

	return len;
}

/*
 * Return primary osd for given pgid, or -1 if none.
 */
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
	int osds[CEPH_PG_MAX_SIZE];
	int primary;

	ceph_calc_pg_acting(osdmap, pgid, osds, &primary);

	return primary;
}
EXPORT_SYMBOL(ceph_calc_pg_primary);