root/fs/nfs/blocklayout/extent_tree.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. ext_node
  2. ext_tree_first
  3. ext_tree_prev
  4. ext_tree_next
  5. ext_f_end
  6. __ext_tree_search
  7. ext_can_merge
  8. ext_try_to_merge_left
  9. ext_try_to_merge_right
  10. __ext_put_deviceids
  11. __ext_tree_insert
  12. __ext_tree_remove
  13. ext_tree_insert
  14. __ext_tree_lookup
  15. ext_tree_lookup
  16. ext_tree_remove
  17. ext_tree_split
  18. ext_tree_mark_written
  19. ext_tree_layoutupdate_size
  20. ext_tree_free_commitdata
  21. encode_block_extent
  22. encode_scsi_range
  23. ext_tree_encode_commit
  24. ext_tree_prepare_commit
  25. ext_tree_mark_committed

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2014-2016 Christoph Hellwig.
   4  */
   5 
   6 #include <linux/vmalloc.h>
   7 
   8 #include "blocklayout.h"
   9 
  10 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
  11 
  12 static inline struct pnfs_block_extent *
  13 ext_node(struct rb_node *node)
  14 {
  15         return rb_entry(node, struct pnfs_block_extent, be_node);
  16 }
  17 
  18 static struct pnfs_block_extent *
  19 ext_tree_first(struct rb_root *root)
  20 {
  21         struct rb_node *node = rb_first(root);
  22         return node ? ext_node(node) : NULL;
  23 }
  24 
  25 static struct pnfs_block_extent *
  26 ext_tree_prev(struct pnfs_block_extent *be)
  27 {
  28         struct rb_node *node = rb_prev(&be->be_node);
  29         return node ? ext_node(node) : NULL;
  30 }
  31 
  32 static struct pnfs_block_extent *
  33 ext_tree_next(struct pnfs_block_extent *be)
  34 {
  35         struct rb_node *node = rb_next(&be->be_node);
  36         return node ? ext_node(node) : NULL;
  37 }
  38 
  39 static inline sector_t
  40 ext_f_end(struct pnfs_block_extent *be)
  41 {
  42         return be->be_f_offset + be->be_length;
  43 }
  44 
  45 static struct pnfs_block_extent *
  46 __ext_tree_search(struct rb_root *root, sector_t start)
  47 {
  48         struct rb_node *node = root->rb_node;
  49         struct pnfs_block_extent *be = NULL;
  50 
  51         while (node) {
  52                 be = ext_node(node);
  53                 if (start < be->be_f_offset)
  54                         node = node->rb_left;
  55                 else if (start >= ext_f_end(be))
  56                         node = node->rb_right;
  57                 else
  58                         return be;
  59         }
  60 
  61         if (be) {
  62                 if (start < be->be_f_offset)
  63                         return be;
  64 
  65                 if (start >= ext_f_end(be))
  66                         return ext_tree_next(be);
  67         }
  68 
  69         return NULL;
  70 }
  71 
  72 static bool
  73 ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
  74 {
  75         if (be1->be_state != be2->be_state)
  76                 return false;
  77         if (be1->be_device != be2->be_device)
  78                 return false;
  79 
  80         if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
  81                 return false;
  82 
  83         if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
  84             (be1->be_v_offset + be1->be_length != be2->be_v_offset))
  85                 return false;
  86 
  87         if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
  88             be1->be_tag != be2->be_tag)
  89                 return false;
  90 
  91         return true;
  92 }
  93 
  94 static struct pnfs_block_extent *
  95 ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
  96 {
  97         struct pnfs_block_extent *left = ext_tree_prev(be);
  98 
  99         if (left && ext_can_merge(left, be)) {
 100                 left->be_length += be->be_length;
 101                 rb_erase(&be->be_node, root);
 102                 nfs4_put_deviceid_node(be->be_device);
 103                 kfree(be);
 104                 return left;
 105         }
 106 
 107         return be;
 108 }
 109 
 110 static struct pnfs_block_extent *
 111 ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
 112 {
 113         struct pnfs_block_extent *right = ext_tree_next(be);
 114 
 115         if (right && ext_can_merge(be, right)) {
 116                 be->be_length += right->be_length;
 117                 rb_erase(&right->be_node, root);
 118                 nfs4_put_deviceid_node(right->be_device);
 119                 kfree(right);
 120         }
 121 
 122         return be;
 123 }
 124 
 125 static void __ext_put_deviceids(struct list_head *head)
 126 {
 127         struct pnfs_block_extent *be, *tmp;
 128 
 129         list_for_each_entry_safe(be, tmp, head, be_list) {
 130                 nfs4_put_deviceid_node(be->be_device);
 131                 kfree(be);
 132         }
 133 }
 134 
 135 static void
 136 __ext_tree_insert(struct rb_root *root,
 137                 struct pnfs_block_extent *new, bool merge_ok)
 138 {
 139         struct rb_node **p = &root->rb_node, *parent = NULL;
 140         struct pnfs_block_extent *be;
 141 
 142         while (*p) {
 143                 parent = *p;
 144                 be = ext_node(parent);
 145 
 146                 if (new->be_f_offset < be->be_f_offset) {
 147                         if (merge_ok && ext_can_merge(new, be)) {
 148                                 be->be_f_offset = new->be_f_offset;
 149                                 if (be->be_state != PNFS_BLOCK_NONE_DATA)
 150                                         be->be_v_offset = new->be_v_offset;
 151                                 be->be_length += new->be_length;
 152                                 be = ext_try_to_merge_left(root, be);
 153                                 goto free_new;
 154                         }
 155                         p = &(*p)->rb_left;
 156                 } else if (new->be_f_offset >= ext_f_end(be)) {
 157                         if (merge_ok && ext_can_merge(be, new)) {
 158                                 be->be_length += new->be_length;
 159                                 be = ext_try_to_merge_right(root, be);
 160                                 goto free_new;
 161                         }
 162                         p = &(*p)->rb_right;
 163                 } else {
 164                         BUG();
 165                 }
 166         }
 167 
 168         rb_link_node(&new->be_node, parent, p);
 169         rb_insert_color(&new->be_node, root);
 170         return;
 171 free_new:
 172         nfs4_put_deviceid_node(new->be_device);
 173         kfree(new);
 174 }
 175 
 176 static int
 177 __ext_tree_remove(struct rb_root *root,
 178                 sector_t start, sector_t end, struct list_head *tmp)
 179 {
 180         struct pnfs_block_extent *be;
 181         sector_t len1 = 0, len2 = 0;
 182         sector_t orig_v_offset;
 183         sector_t orig_len;
 184 
 185         be = __ext_tree_search(root, start);
 186         if (!be)
 187                 return 0;
 188         if (be->be_f_offset >= end)
 189                 return 0;
 190 
 191         orig_v_offset = be->be_v_offset;
 192         orig_len = be->be_length;
 193 
 194         if (start > be->be_f_offset)
 195                 len1 = start - be->be_f_offset;
 196         if (ext_f_end(be) > end)
 197                 len2 = ext_f_end(be) - end;
 198 
 199         if (len2 > 0) {
 200                 if (len1 > 0) {
 201                         struct pnfs_block_extent *new;
 202 
 203                         new = kzalloc(sizeof(*new), GFP_ATOMIC);
 204                         if (!new)
 205                                 return -ENOMEM;
 206 
 207                         be->be_length = len1;
 208 
 209                         new->be_f_offset = end;
 210                         if (be->be_state != PNFS_BLOCK_NONE_DATA) {
 211                                 new->be_v_offset =
 212                                         orig_v_offset + orig_len - len2;
 213                         }
 214                         new->be_length = len2;
 215                         new->be_state = be->be_state;
 216                         new->be_tag = be->be_tag;
 217                         new->be_device = nfs4_get_deviceid(be->be_device);
 218 
 219                         __ext_tree_insert(root, new, true);
 220                 } else {
 221                         be->be_f_offset = end;
 222                         if (be->be_state != PNFS_BLOCK_NONE_DATA) {
 223                                 be->be_v_offset =
 224                                         orig_v_offset + orig_len - len2;
 225                         }
 226                         be->be_length = len2;
 227                 }
 228         } else {
 229                 if (len1 > 0) {
 230                         be->be_length = len1;
 231                         be = ext_tree_next(be);
 232                 }
 233 
 234                 while (be && ext_f_end(be) <= end) {
 235                         struct pnfs_block_extent *next = ext_tree_next(be);
 236 
 237                         rb_erase(&be->be_node, root);
 238                         list_add_tail(&be->be_list, tmp);
 239                         be = next;
 240                 }
 241 
 242                 if (be && be->be_f_offset < end) {
 243                         len1 = ext_f_end(be) - end;
 244                         be->be_f_offset = end;
 245                         if (be->be_state != PNFS_BLOCK_NONE_DATA)
 246                                 be->be_v_offset += be->be_length - len1;
 247                         be->be_length = len1;
 248                 }
 249         }
 250 
 251         return 0;
 252 }
 253 
 254 int
 255 ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
 256 {
 257         struct pnfs_block_extent *be;
 258         struct rb_root *root;
 259         int err = 0;
 260 
 261         switch (new->be_state) {
 262         case PNFS_BLOCK_READWRITE_DATA:
 263         case PNFS_BLOCK_INVALID_DATA:
 264                 root = &bl->bl_ext_rw;
 265                 break;
 266         case PNFS_BLOCK_READ_DATA:
 267         case PNFS_BLOCK_NONE_DATA:
 268                 root = &bl->bl_ext_ro;
 269                 break;
 270         default:
 271                 dprintk("invalid extent type\n");
 272                 return -EINVAL;
 273         }
 274 
 275         spin_lock(&bl->bl_ext_lock);
 276 retry:
 277         be = __ext_tree_search(root, new->be_f_offset);
 278         if (!be || be->be_f_offset >= ext_f_end(new)) {
 279                 __ext_tree_insert(root, new, true);
 280         } else if (new->be_f_offset >= be->be_f_offset) {
 281                 if (ext_f_end(new) <= ext_f_end(be)) {
 282                         nfs4_put_deviceid_node(new->be_device);
 283                         kfree(new);
 284                 } else {
 285                         sector_t new_len = ext_f_end(new) - ext_f_end(be);
 286                         sector_t diff = new->be_length - new_len;
 287 
 288                         new->be_f_offset += diff;
 289                         new->be_v_offset += diff;
 290                         new->be_length = new_len;
 291                         goto retry;
 292                 }
 293         } else if (ext_f_end(new) <= ext_f_end(be)) {
 294                 new->be_length = be->be_f_offset - new->be_f_offset;
 295                 __ext_tree_insert(root, new, true);
 296         } else {
 297                 struct pnfs_block_extent *split;
 298                 sector_t new_len = ext_f_end(new) - ext_f_end(be);
 299                 sector_t diff = new->be_length - new_len;
 300 
 301                 split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
 302                 if (!split) {
 303                         err = -EINVAL;
 304                         goto out;
 305                 }
 306 
 307                 split->be_length = be->be_f_offset - split->be_f_offset;
 308                 split->be_device = nfs4_get_deviceid(new->be_device);
 309                 __ext_tree_insert(root, split, true);
 310 
 311                 new->be_f_offset += diff;
 312                 new->be_v_offset += diff;
 313                 new->be_length = new_len;
 314                 goto retry;
 315         }
 316 out:
 317         spin_unlock(&bl->bl_ext_lock);
 318         return err;
 319 }
 320 
 321 static bool
 322 __ext_tree_lookup(struct rb_root *root, sector_t isect,
 323                 struct pnfs_block_extent *ret)
 324 {
 325         struct rb_node *node;
 326         struct pnfs_block_extent *be;
 327 
 328         node = root->rb_node;
 329         while (node) {
 330                 be = ext_node(node);
 331                 if (isect < be->be_f_offset)
 332                         node = node->rb_left;
 333                 else if (isect >= ext_f_end(be))
 334                         node = node->rb_right;
 335                 else {
 336                         *ret = *be;
 337                         return true;
 338                 }
 339         }
 340 
 341         return false;
 342 }
 343 
 344 bool
 345 ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
 346             struct pnfs_block_extent *ret, bool rw)
 347 {
 348         bool found = false;
 349 
 350         spin_lock(&bl->bl_ext_lock);
 351         if (!rw)
 352                 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
 353         if (!found)
 354                 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
 355         spin_unlock(&bl->bl_ext_lock);
 356 
 357         return found;
 358 }
 359 
 360 int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
 361                 sector_t start, sector_t end)
 362 {
 363         int err, err2;
 364         LIST_HEAD(tmp);
 365 
 366         spin_lock(&bl->bl_ext_lock);
 367         err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
 368         if (rw) {
 369                 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
 370                 if (!err)
 371                         err = err2;
 372         }
 373         spin_unlock(&bl->bl_ext_lock);
 374 
 375         __ext_put_deviceids(&tmp);
 376         return err;
 377 }
 378 
 379 static int
 380 ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
 381                 sector_t split)
 382 {
 383         struct pnfs_block_extent *new;
 384         sector_t orig_len = be->be_length;
 385 
 386         new = kzalloc(sizeof(*new), GFP_ATOMIC);
 387         if (!new)
 388                 return -ENOMEM;
 389 
 390         be->be_length = split - be->be_f_offset;
 391 
 392         new->be_f_offset = split;
 393         if (be->be_state != PNFS_BLOCK_NONE_DATA)
 394                 new->be_v_offset = be->be_v_offset + be->be_length;
 395         new->be_length = orig_len - be->be_length;
 396         new->be_state = be->be_state;
 397         new->be_tag = be->be_tag;
 398         new->be_device = nfs4_get_deviceid(be->be_device);
 399 
 400         __ext_tree_insert(root, new, false);
 401         return 0;
 402 }
 403 
 404 int
 405 ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 406                 sector_t len, u64 lwb)
 407 {
 408         struct rb_root *root = &bl->bl_ext_rw;
 409         sector_t end = start + len;
 410         struct pnfs_block_extent *be;
 411         int err = 0;
 412         LIST_HEAD(tmp);
 413 
 414         spin_lock(&bl->bl_ext_lock);
 415         /*
 416          * First remove all COW extents or holes from written to range.
 417          */
 418         err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
 419         if (err)
 420                 goto out;
 421 
 422         /*
 423          * Then mark all invalid extents in the range as written to.
 424          */
 425         for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
 426                 if (be->be_f_offset >= end)
 427                         break;
 428 
 429                 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
 430                         continue;
 431 
 432                 if (be->be_f_offset < start) {
 433                         struct pnfs_block_extent *left = ext_tree_prev(be);
 434 
 435                         if (left && ext_can_merge(left, be)) {
 436                                 sector_t diff = start - be->be_f_offset;
 437 
 438                                 left->be_length += diff;
 439 
 440                                 be->be_f_offset += diff;
 441                                 be->be_v_offset += diff;
 442                                 be->be_length -= diff;
 443                         } else {
 444                                 err = ext_tree_split(root, be, start);
 445                                 if (err)
 446                                         goto out;
 447                         }
 448                 }
 449 
 450                 if (ext_f_end(be) > end) {
 451                         struct pnfs_block_extent *right = ext_tree_next(be);
 452 
 453                         if (right && ext_can_merge(be, right)) {
 454                                 sector_t diff = end - be->be_f_offset;
 455 
 456                                 be->be_length -= diff;
 457 
 458                                 right->be_f_offset -= diff;
 459                                 right->be_v_offset -= diff;
 460                                 right->be_length += diff;
 461                         } else {
 462                                 err = ext_tree_split(root, be, end);
 463                                 if (err)
 464                                         goto out;
 465                         }
 466                 }
 467 
 468                 if (be->be_f_offset >= start && ext_f_end(be) <= end) {
 469                         be->be_tag = EXTENT_WRITTEN;
 470                         be = ext_try_to_merge_left(root, be);
 471                         be = ext_try_to_merge_right(root, be);
 472                 }
 473         }
 474 out:
 475         if (bl->bl_lwb < lwb)
 476                 bl->bl_lwb = lwb;
 477         spin_unlock(&bl->bl_ext_lock);
 478 
 479         __ext_put_deviceids(&tmp);
 480         return err;
 481 }
 482 
 483 static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
 484 {
 485         if (bl->bl_scsi_layout)
 486                 return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
 487         else
 488                 return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
 489 }
 490 
 491 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 492                 size_t buffer_size)
 493 {
 494         if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
 495                 int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
 496 
 497                 for (i = 0; i < nr_pages; i++)
 498                         put_page(arg->layoutupdate_pages[i]);
 499                 vfree(arg->start_p);
 500                 kfree(arg->layoutupdate_pages);
 501         } else {
 502                 put_page(arg->layoutupdate_page);
 503         }
 504 }
 505 
 506 static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
 507 {
 508         p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
 509                         NFS4_DEVICEID4_SIZE);
 510         p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
 511         p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
 512         p = xdr_encode_hyper(p, 0LL);
 513         *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
 514         return p;
 515 }
 516 
 517 static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
 518 {
 519         p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
 520         return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
 521 }
 522 
 523 static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 524                 size_t buffer_size, size_t *count, __u64 *lastbyte)
 525 {
 526         struct pnfs_block_extent *be;
 527         int ret = 0;
 528 
 529         spin_lock(&bl->bl_ext_lock);
 530         for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
 531                 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
 532                     be->be_tag != EXTENT_WRITTEN)
 533                         continue;
 534 
 535                 (*count)++;
 536                 if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
 537                         /* keep counting.. */
 538                         ret = -ENOSPC;
 539                         continue;
 540                 }
 541 
 542                 if (bl->bl_scsi_layout)
 543                         p = encode_scsi_range(be, p);
 544                 else
 545                         p = encode_block_extent(be, p);
 546                 be->be_tag = EXTENT_COMMITTING;
 547         }
 548         *lastbyte = bl->bl_lwb - 1;
 549         bl->bl_lwb = 0;
 550         spin_unlock(&bl->bl_ext_lock);
 551 
 552         return ret;
 553 }
 554 
 555 int
 556 ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
 557 {
 558         struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
 559         size_t count = 0, buffer_size = PAGE_SIZE;
 560         __be32 *start_p;
 561         int ret;
 562 
 563         dprintk("%s enter\n", __func__);
 564 
 565         arg->layoutupdate_page = alloc_page(GFP_NOFS);
 566         if (!arg->layoutupdate_page)
 567                 return -ENOMEM;
 568         start_p = page_address(arg->layoutupdate_page);
 569         arg->layoutupdate_pages = &arg->layoutupdate_page;
 570 
 571 retry:
 572         ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
 573         if (unlikely(ret)) {
 574                 ext_tree_free_commitdata(arg, buffer_size);
 575 
 576                 buffer_size = ext_tree_layoutupdate_size(bl, count);
 577                 count = 0;
 578 
 579                 arg->layoutupdate_pages =
 580                         kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
 581                                 sizeof(struct page *), GFP_NOFS);
 582                 if (!arg->layoutupdate_pages)
 583                         return -ENOMEM;
 584 
 585                 start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
 586                 if (!start_p) {
 587                         kfree(arg->layoutupdate_pages);
 588                         return -ENOMEM;
 589                 }
 590 
 591                 goto retry;
 592         }
 593 
 594         *start_p = cpu_to_be32(count);
 595         arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
 596 
 597         if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 598                 void *p = start_p, *end = p + arg->layoutupdate_len;
 599                 struct page *page = NULL;
 600                 int i = 0;
 601 
 602                 arg->start_p = start_p;
 603                 for ( ; p < end; p += PAGE_SIZE) {
 604                         page = vmalloc_to_page(p);
 605                         arg->layoutupdate_pages[i++] = page;
 606                         get_page(page);
 607                 }
 608         }
 609 
 610         dprintk("%s found %zu ranges\n", __func__, count);
 611         return 0;
 612 }
 613 
 614 void
 615 ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
 616 {
 617         struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
 618         struct rb_root *root = &bl->bl_ext_rw;
 619         struct pnfs_block_extent *be;
 620 
 621         dprintk("%s status %d\n", __func__, status);
 622 
 623         ext_tree_free_commitdata(arg, arg->layoutupdate_len);
 624 
 625         spin_lock(&bl->bl_ext_lock);
 626         for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
 627                 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
 628                     be->be_tag != EXTENT_COMMITTING)
 629                         continue;
 630 
 631                 if (status) {
 632                         /*
 633                          * Mark as written and try again.
 634                          *
 635                          * XXX: some real error handling here wouldn't hurt..
 636                          */
 637                         be->be_tag = EXTENT_WRITTEN;
 638                 } else {
 639                         be->be_state = PNFS_BLOCK_READWRITE_DATA;
 640                         be->be_tag = 0;
 641                 }
 642 
 643                 be = ext_try_to_merge_left(root, be);
 644                 be = ext_try_to_merge_right(root, be);
 645         }
 646         spin_unlock(&bl->bl_ext_lock);
 647 }

/* [<][>][^][v][top][bottom][index][help] */