1/* 2 * Copyright (c) 2014 Christoph Hellwig. 3 */ 4 5#include <linux/vmalloc.h> 6 7#include "blocklayout.h" 8 9#define NFSDBG_FACILITY NFSDBG_PNFS_LD 10 11static inline struct pnfs_block_extent * 12ext_node(struct rb_node *node) 13{ 14 return rb_entry(node, struct pnfs_block_extent, be_node); 15} 16 17static struct pnfs_block_extent * 18ext_tree_first(struct rb_root *root) 19{ 20 struct rb_node *node = rb_first(root); 21 return node ? ext_node(node) : NULL; 22} 23 24static struct pnfs_block_extent * 25ext_tree_prev(struct pnfs_block_extent *be) 26{ 27 struct rb_node *node = rb_prev(&be->be_node); 28 return node ? ext_node(node) : NULL; 29} 30 31static struct pnfs_block_extent * 32ext_tree_next(struct pnfs_block_extent *be) 33{ 34 struct rb_node *node = rb_next(&be->be_node); 35 return node ? ext_node(node) : NULL; 36} 37 38static inline sector_t 39ext_f_end(struct pnfs_block_extent *be) 40{ 41 return be->be_f_offset + be->be_length; 42} 43 44static struct pnfs_block_extent * 45__ext_tree_search(struct rb_root *root, sector_t start) 46{ 47 struct rb_node *node = root->rb_node; 48 struct pnfs_block_extent *be = NULL; 49 50 while (node) { 51 be = ext_node(node); 52 if (start < be->be_f_offset) 53 node = node->rb_left; 54 else if (start >= ext_f_end(be)) 55 node = node->rb_right; 56 else 57 return be; 58 } 59 60 if (be) { 61 if (start < be->be_f_offset) 62 return be; 63 64 if (start >= ext_f_end(be)) 65 return ext_tree_next(be); 66 } 67 68 return NULL; 69} 70 71static bool 72ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2) 73{ 74 if (be1->be_state != be2->be_state) 75 return false; 76 if (be1->be_device != be2->be_device) 77 return false; 78 79 if (be1->be_f_offset + be1->be_length != be2->be_f_offset) 80 return false; 81 82 if (be1->be_state != PNFS_BLOCK_NONE_DATA && 83 (be1->be_v_offset + be1->be_length != be2->be_v_offset)) 84 return false; 85 86 if (be1->be_state == PNFS_BLOCK_INVALID_DATA && 87 be1->be_tag != be2->be_tag) 88 return false; 89 90 return true; 91} 92 93static struct pnfs_block_extent * 94ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be) 95{ 96 struct pnfs_block_extent *left = ext_tree_prev(be); 97 98 if (left && ext_can_merge(left, be)) { 99 left->be_length += be->be_length; 100 rb_erase(&be->be_node, root); 101 nfs4_put_deviceid_node(be->be_device); 102 kfree(be); 103 return left; 104 } 105 106 return be; 107} 108 109static struct pnfs_block_extent * 110ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) 111{ 112 struct pnfs_block_extent *right = ext_tree_next(be); 113 114 if (right && ext_can_merge(be, right)) { 115 be->be_length += right->be_length; 116 rb_erase(&right->be_node, root); 117 nfs4_put_deviceid_node(right->be_device); 118 kfree(right); 119 } 120 121 return be; 122} 123 124static void 125__ext_tree_insert(struct rb_root *root, 126 struct pnfs_block_extent *new, bool merge_ok) 127{ 128 struct rb_node **p = &root->rb_node, *parent = NULL; 129 struct pnfs_block_extent *be; 130 131 while (*p) { 132 parent = *p; 133 be = ext_node(parent); 134 135 if (new->be_f_offset < be->be_f_offset) { 136 if (merge_ok && ext_can_merge(new, be)) { 137 be->be_f_offset = new->be_f_offset; 138 if (be->be_state != PNFS_BLOCK_NONE_DATA) 139 be->be_v_offset = new->be_v_offset; 140 be->be_length += new->be_length; 141 be = ext_try_to_merge_left(root, be); 142 goto free_new; 143 } 144 p = &(*p)->rb_left; 145 } else if (new->be_f_offset >= ext_f_end(be)) { 146 if (merge_ok && ext_can_merge(be, new)) { 147 be->be_length += new->be_length; 148 be = ext_try_to_merge_right(root, be); 149 goto free_new; 150 } 151 p = &(*p)->rb_right; 152 } else { 153 BUG(); 154 } 155 } 156 157 rb_link_node(&new->be_node, parent, p); 158 rb_insert_color(&new->be_node, root); 159 return; 160free_new: 161 nfs4_put_deviceid_node(new->be_device); 162 kfree(new); 163} 164 165static int 166__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) 167{ 168 struct pnfs_block_extent *be; 169 sector_t len1 = 0, len2 = 0; 170 sector_t orig_v_offset; 171 sector_t orig_len; 172 173 be = __ext_tree_search(root, start); 174 if (!be) 175 return 0; 176 if (be->be_f_offset >= end) 177 return 0; 178 179 orig_v_offset = be->be_v_offset; 180 orig_len = be->be_length; 181 182 if (start > be->be_f_offset) 183 len1 = start - be->be_f_offset; 184 if (ext_f_end(be) > end) 185 len2 = ext_f_end(be) - end; 186 187 if (len2 > 0) { 188 if (len1 > 0) { 189 struct pnfs_block_extent *new; 190 191 new = kzalloc(sizeof(*new), GFP_ATOMIC); 192 if (!new) 193 return -ENOMEM; 194 195 be->be_length = len1; 196 197 new->be_f_offset = end; 198 if (be->be_state != PNFS_BLOCK_NONE_DATA) { 199 new->be_v_offset = 200 orig_v_offset + orig_len - len2; 201 } 202 new->be_length = len2; 203 new->be_state = be->be_state; 204 new->be_tag = be->be_tag; 205 new->be_device = nfs4_get_deviceid(be->be_device); 206 207 __ext_tree_insert(root, new, true); 208 } else { 209 be->be_f_offset = end; 210 if (be->be_state != PNFS_BLOCK_NONE_DATA) { 211 be->be_v_offset = 212 orig_v_offset + orig_len - len2; 213 } 214 be->be_length = len2; 215 } 216 } else { 217 if (len1 > 0) { 218 be->be_length = len1; 219 be = ext_tree_next(be); 220 } 221 222 while (be && ext_f_end(be) <= end) { 223 struct pnfs_block_extent *next = ext_tree_next(be); 224 225 rb_erase(&be->be_node, root); 226 nfs4_put_deviceid_node(be->be_device); 227 kfree(be); 228 be = next; 229 } 230 231 if (be && be->be_f_offset < end) { 232 len1 = ext_f_end(be) - end; 233 be->be_f_offset = end; 234 if (be->be_state != PNFS_BLOCK_NONE_DATA) 235 be->be_v_offset += be->be_length - len1; 236 be->be_length = len1; 237 } 238 } 239 240 return 0; 241} 242 243int 244ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new) 245{ 246 struct pnfs_block_extent *be; 247 struct rb_root *root; 248 int err = 0; 249 250 switch (new->be_state) { 251 case PNFS_BLOCK_READWRITE_DATA: 252 case PNFS_BLOCK_INVALID_DATA: 253 root = &bl->bl_ext_rw; 254 break; 255 case PNFS_BLOCK_READ_DATA: 256 case PNFS_BLOCK_NONE_DATA: 257 root = &bl->bl_ext_ro; 258 break; 259 default: 260 dprintk("invalid extent type\n"); 261 return -EINVAL; 262 } 263 264 spin_lock(&bl->bl_ext_lock); 265retry: 266 be = __ext_tree_search(root, new->be_f_offset); 267 if (!be || be->be_f_offset >= ext_f_end(new)) { 268 __ext_tree_insert(root, new, true); 269 } else if (new->be_f_offset >= be->be_f_offset) { 270 if (ext_f_end(new) <= ext_f_end(be)) { 271 nfs4_put_deviceid_node(new->be_device); 272 kfree(new); 273 } else { 274 sector_t new_len = ext_f_end(new) - ext_f_end(be); 275 sector_t diff = new->be_length - new_len; 276 277 new->be_f_offset += diff; 278 new->be_v_offset += diff; 279 new->be_length = new_len; 280 goto retry; 281 } 282 } else if (ext_f_end(new) <= ext_f_end(be)) { 283 new->be_length = be->be_f_offset - new->be_f_offset; 284 __ext_tree_insert(root, new, true); 285 } else { 286 struct pnfs_block_extent *split; 287 sector_t new_len = ext_f_end(new) - ext_f_end(be); 288 sector_t diff = new->be_length - new_len; 289 290 split = kmemdup(new, sizeof(*new), GFP_ATOMIC); 291 if (!split) { 292 err = -EINVAL; 293 goto out; 294 } 295 296 split->be_length = be->be_f_offset - split->be_f_offset; 297 split->be_device = nfs4_get_deviceid(new->be_device); 298 __ext_tree_insert(root, split, true); 299 300 new->be_f_offset += diff; 301 new->be_v_offset += diff; 302 new->be_length = new_len; 303 goto retry; 304 } 305out: 306 spin_unlock(&bl->bl_ext_lock); 307 return err; 308} 309 310static bool 311__ext_tree_lookup(struct rb_root *root, sector_t isect, 312 struct pnfs_block_extent *ret) 313{ 314 struct rb_node *node; 315 struct pnfs_block_extent *be; 316 317 node = root->rb_node; 318 while (node) { 319 be = ext_node(node); 320 if (isect < be->be_f_offset) 321 node = node->rb_left; 322 else if (isect >= ext_f_end(be)) 323 node = node->rb_right; 324 else { 325 *ret = *be; 326 return true; 327 } 328 } 329 330 return false; 331} 332 333bool 334ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, 335 struct pnfs_block_extent *ret, bool rw) 336{ 337 bool found = false; 338 339 spin_lock(&bl->bl_ext_lock); 340 if (!rw) 341 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret); 342 if (!found) 343 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret); 344 spin_unlock(&bl->bl_ext_lock); 345 346 return found; 347} 348 349int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, 350 sector_t start, sector_t end) 351{ 352 int err, err2; 353 354 spin_lock(&bl->bl_ext_lock); 355 err = __ext_tree_remove(&bl->bl_ext_ro, start, end); 356 if (rw) { 357 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); 358 if (!err) 359 err = err2; 360 } 361 spin_unlock(&bl->bl_ext_lock); 362 363 return err; 364} 365 366static int 367ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, 368 sector_t split) 369{ 370 struct pnfs_block_extent *new; 371 sector_t orig_len = be->be_length; 372 373 new = kzalloc(sizeof(*new), GFP_ATOMIC); 374 if (!new) 375 return -ENOMEM; 376 377 be->be_length = split - be->be_f_offset; 378 379 new->be_f_offset = split; 380 if (be->be_state != PNFS_BLOCK_NONE_DATA) 381 new->be_v_offset = be->be_v_offset + be->be_length; 382 new->be_length = orig_len - be->be_length; 383 new->be_state = be->be_state; 384 new->be_tag = be->be_tag; 385 new->be_device = nfs4_get_deviceid(be->be_device); 386 387 __ext_tree_insert(root, new, false); 388 return 0; 389} 390 391int 392ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, 393 sector_t len) 394{ 395 struct rb_root *root = &bl->bl_ext_rw; 396 sector_t end = start + len; 397 struct pnfs_block_extent *be; 398 int err = 0; 399 400 spin_lock(&bl->bl_ext_lock); 401 /* 402 * First remove all COW extents or holes from written to range. 403 */ 404 err = __ext_tree_remove(&bl->bl_ext_ro, start, end); 405 if (err) 406 goto out; 407 408 /* 409 * Then mark all invalid extents in the range as written to. 410 */ 411 for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) { 412 if (be->be_f_offset >= end) 413 break; 414 415 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag) 416 continue; 417 418 if (be->be_f_offset < start) { 419 struct pnfs_block_extent *left = ext_tree_prev(be); 420 421 if (left && ext_can_merge(left, be)) { 422 sector_t diff = start - be->be_f_offset; 423 424 left->be_length += diff; 425 426 be->be_f_offset += diff; 427 be->be_v_offset += diff; 428 be->be_length -= diff; 429 } else { 430 err = ext_tree_split(root, be, start); 431 if (err) 432 goto out; 433 } 434 } 435 436 if (ext_f_end(be) > end) { 437 struct pnfs_block_extent *right = ext_tree_next(be); 438 439 if (right && ext_can_merge(be, right)) { 440 sector_t diff = end - be->be_f_offset; 441 442 be->be_length -= diff; 443 444 right->be_f_offset -= diff; 445 right->be_v_offset -= diff; 446 right->be_length += diff; 447 } else { 448 err = ext_tree_split(root, be, end); 449 if (err) 450 goto out; 451 } 452 } 453 454 if (be->be_f_offset >= start && ext_f_end(be) <= end) { 455 be->be_tag = EXTENT_WRITTEN; 456 be = ext_try_to_merge_left(root, be); 457 be = ext_try_to_merge_right(root, be); 458 } 459 } 460out: 461 spin_unlock(&bl->bl_ext_lock); 462 return err; 463} 464 465static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, 466 size_t buffer_size) 467{ 468 if (arg->layoutupdate_pages != &arg->layoutupdate_page) { 469 int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i; 470 471 for (i = 0; i < nr_pages; i++) 472 put_page(arg->layoutupdate_pages[i]); 473 kfree(arg->layoutupdate_pages); 474 } else { 475 put_page(arg->layoutupdate_page); 476 } 477} 478 479static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, 480 size_t buffer_size, size_t *count) 481{ 482 struct pnfs_block_extent *be; 483 int ret = 0; 484 485 spin_lock(&bl->bl_ext_lock); 486 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { 487 if (be->be_state != PNFS_BLOCK_INVALID_DATA || 488 be->be_tag != EXTENT_WRITTEN) 489 continue; 490 491 (*count)++; 492 if (*count * BL_EXTENT_SIZE > buffer_size) { 493 /* keep counting.. */ 494 ret = -ENOSPC; 495 continue; 496 } 497 498 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, 499 NFS4_DEVICEID4_SIZE); 500 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); 501 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); 502 p = xdr_encode_hyper(p, 0LL); 503 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); 504 505 be->be_tag = EXTENT_COMMITTING; 506 } 507 spin_unlock(&bl->bl_ext_lock); 508 509 return ret; 510} 511 512int 513ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) 514{ 515 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); 516 size_t count = 0, buffer_size = PAGE_SIZE; 517 __be32 *start_p; 518 int ret; 519 520 dprintk("%s enter\n", __func__); 521 522 arg->layoutupdate_page = alloc_page(GFP_NOFS); 523 if (!arg->layoutupdate_page) 524 return -ENOMEM; 525 start_p = page_address(arg->layoutupdate_page); 526 arg->layoutupdate_pages = &arg->layoutupdate_page; 527 528retry: 529 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); 530 if (unlikely(ret)) { 531 ext_tree_free_commitdata(arg, buffer_size); 532 533 buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; 534 count = 0; 535 536 arg->layoutupdate_pages = 537 kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 538 sizeof(struct page *), GFP_NOFS); 539 if (!arg->layoutupdate_pages) 540 return -ENOMEM; 541 542 start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); 543 if (!start_p) { 544 kfree(arg->layoutupdate_pages); 545 return -ENOMEM; 546 } 547 548 goto retry; 549 } 550 551 *start_p = cpu_to_be32(count); 552 arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; 553 554 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { 555 __be32 *p = start_p; 556 int i = 0; 557 558 for (p = start_p; 559 p < start_p + arg->layoutupdate_len; 560 p += PAGE_SIZE) { 561 arg->layoutupdate_pages[i++] = vmalloc_to_page(p); 562 } 563 } 564 565 dprintk("%s found %zu ranges\n", __func__, count); 566 return 0; 567} 568 569void 570ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status) 571{ 572 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); 573 struct rb_root *root = &bl->bl_ext_rw; 574 struct pnfs_block_extent *be; 575 576 dprintk("%s status %d\n", __func__, status); 577 578 ext_tree_free_commitdata(arg, arg->layoutupdate_len); 579 580 spin_lock(&bl->bl_ext_lock); 581 for (be = ext_tree_first(root); be; be = ext_tree_next(be)) { 582 if (be->be_state != PNFS_BLOCK_INVALID_DATA || 583 be->be_tag != EXTENT_COMMITTING) 584 continue; 585 586 if (status) { 587 /* 588 * Mark as written and try again. 589 * 590 * XXX: some real error handling here wouldn't hurt.. 591 */ 592 be->be_tag = EXTENT_WRITTEN; 593 } else { 594 be->be_state = PNFS_BLOCK_READWRITE_DATA; 595 be->be_tag = 0; 596 } 597 598 be = ext_try_to_merge_left(root, be); 599 be = ext_try_to_merge_right(root, be); 600 } 601 spin_unlock(&bl->bl_ext_lock); 602} 603