root/fs/afs/rotate.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. afs_begin_vnode_operation
  2. afs_start_fs_iteration
  3. afs_busy
  4. afs_sleep_and_retry
  5. afs_select_fileserver
  6. afs_select_current_fileserver
  7. afs_dump_edestaddrreq
  8. afs_end_vnode_operation

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /* Handle fileserver selection and rotation.
   3  *
   4  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
   5  * Written by David Howells (dhowells@redhat.com)
   6  */
   7 
   8 #include <linux/kernel.h>
   9 #include <linux/slab.h>
  10 #include <linux/fs.h>
  11 #include <linux/sched.h>
  12 #include <linux/delay.h>
  13 #include <linux/sched/signal.h>
  14 #include "internal.h"
  15 #include "afs_fs.h"
  16 
  17 /*
  18  * Begin an operation on the fileserver.
  19  *
  20  * Fileserver operations are serialised on the server by vnode, so we serialise
  21  * them here also using the io_lock.
  22  */
  23 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
  24                                struct key *key, bool intr)
  25 {
  26         memset(fc, 0, sizeof(*fc));
  27         fc->vnode = vnode;
  28         fc->key = key;
  29         fc->ac.error = SHRT_MAX;
  30         fc->error = -EDESTADDRREQ;
  31 
  32         if (intr) {
  33                 fc->flags |= AFS_FS_CURSOR_INTR;
  34                 if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
  35                         fc->error = -EINTR;
  36                         fc->flags |= AFS_FS_CURSOR_STOP;
  37                         return false;
  38                 }
  39         } else {
  40                 mutex_lock(&vnode->io_lock);
  41         }
  42 
  43         if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
  44                 fc->flags |= AFS_FS_CURSOR_CUR_ONLY;
  45         return true;
  46 }
  47 
  48 /*
  49  * Begin iteration through a server list, starting with the vnode's last used
  50  * server if possible, or the last recorded good server if not.
  51  */
  52 static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
  53                                    struct afs_vnode *vnode)
  54 {
  55         struct afs_cb_interest *cbi;
  56         int i;
  57 
  58         read_lock(&vnode->volume->servers_lock);
  59         fc->server_list = afs_get_serverlist(vnode->volume->servers);
  60         read_unlock(&vnode->volume->servers_lock);
  61 
  62         fc->untried = (1UL << fc->server_list->nr_servers) - 1;
  63         fc->index = READ_ONCE(fc->server_list->preferred);
  64 
  65         cbi = rcu_dereference_protected(vnode->cb_interest,
  66                                         lockdep_is_held(&vnode->io_lock));
  67         if (cbi) {
  68                 /* See if the vnode's preferred record is still available */
  69                 for (i = 0; i < fc->server_list->nr_servers; i++) {
  70                         if (fc->server_list->servers[i].cb_interest == cbi) {
  71                                 fc->index = i;
  72                                 goto found_interest;
  73                         }
  74                 }
  75 
  76                 /* If we have a lock outstanding on a server that's no longer
  77                  * serving this vnode, then we can't switch to another server
  78                  * and have to return an error.
  79                  */
  80                 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
  81                         fc->error = -ESTALE;
  82                         return false;
  83                 }
  84 
  85                 /* Note that the callback promise is effectively broken */
  86                 write_seqlock(&vnode->cb_lock);
  87                 ASSERTCMP(cbi, ==, rcu_access_pointer(vnode->cb_interest));
  88                 rcu_assign_pointer(vnode->cb_interest, NULL);
  89                 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
  90                         vnode->cb_break++;
  91                 write_sequnlock(&vnode->cb_lock);
  92 
  93                 afs_put_cb_interest(afs_v2net(vnode), cbi);
  94                 cbi = NULL;
  95         }
  96 
  97 found_interest:
  98         return true;
  99 }
 100 
 101 /*
 102  * Post volume busy note.
 103  */
 104 static void afs_busy(struct afs_volume *volume, u32 abort_code)
 105 {
 106         const char *m;
 107 
 108         switch (abort_code) {
 109         case VOFFLINE:          m = "offline";          break;
 110         case VRESTARTING:       m = "restarting";       break;
 111         case VSALVAGING:        m = "being salvaged";   break;
 112         default:                m = "busy";             break;
 113         }
 114 
 115         pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
 116 }
 117 
 118 /*
 119  * Sleep and retry the operation to the same fileserver.
 120  */
 121 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
 122 {
 123         if (fc->flags & AFS_FS_CURSOR_INTR) {
 124                 msleep_interruptible(1000);
 125                 if (signal_pending(current)) {
 126                         fc->error = -ERESTARTSYS;
 127                         return false;
 128                 }
 129         } else {
 130                 msleep(1000);
 131         }
 132 
 133         return true;
 134 }
 135 
 136 /*
 137  * Select the fileserver to use.  May be called multiple times to rotate
 138  * through the fileservers.
 139  */
 140 bool afs_select_fileserver(struct afs_fs_cursor *fc)
 141 {
 142         struct afs_addr_list *alist;
 143         struct afs_server *server;
 144         struct afs_vnode *vnode = fc->vnode;
 145         struct afs_error e;
 146         u32 rtt;
 147         int error = fc->ac.error, i;
 148 
 149         _enter("%lx[%d],%lx[%d],%d,%d",
 150                fc->untried, fc->index,
 151                fc->ac.tried, fc->ac.index,
 152                error, fc->ac.abort_code);
 153 
 154         if (fc->flags & AFS_FS_CURSOR_STOP) {
 155                 _leave(" = f [stopped]");
 156                 return false;
 157         }
 158 
 159         fc->nr_iterations++;
 160 
 161         /* Evaluate the result of the previous operation, if there was one. */
 162         switch (error) {
 163         case SHRT_MAX:
 164                 goto start;
 165 
 166         case 0:
 167         default:
 168                 /* Success or local failure.  Stop. */
 169                 fc->error = error;
 170                 fc->flags |= AFS_FS_CURSOR_STOP;
 171                 _leave(" = f [okay/local %d]", error);
 172                 return false;
 173 
 174         case -ECONNABORTED:
 175                 /* The far side rejected the operation on some grounds.  This
 176                  * might involve the server being busy or the volume having been moved.
 177                  */
 178                 switch (fc->ac.abort_code) {
 179                 case VNOVOL:
 180                         /* This fileserver doesn't know about the volume.
 181                          * - May indicate that the VL is wrong - retry once and compare
 182                          *   the results.
 183                          * - May indicate that the fileserver couldn't attach to the vol.
 184                          */
 185                         if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
 186                                 fc->error = -EREMOTEIO;
 187                                 goto next_server;
 188                         }
 189 
 190                         write_lock(&vnode->volume->servers_lock);
 191                         fc->server_list->vnovol_mask |= 1 << fc->index;
 192                         write_unlock(&vnode->volume->servers_lock);
 193 
 194                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
 195                         error = afs_check_volume_status(vnode->volume, fc);
 196                         if (error < 0)
 197                                 goto failed_set_error;
 198 
 199                         if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
 200                                 fc->error = -ENOMEDIUM;
 201                                 goto failed;
 202                         }
 203 
 204                         /* If the server list didn't change, then assume that
 205                          * it's the fileserver having trouble.
 206                          */
 207                         if (vnode->volume->servers == fc->server_list) {
 208                                 fc->error = -EREMOTEIO;
 209                                 goto next_server;
 210                         }
 211 
 212                         /* Try again */
 213                         fc->flags |= AFS_FS_CURSOR_VNOVOL;
 214                         _leave(" = t [vnovol]");
 215                         return true;
 216 
 217                 case VSALVAGE: /* TODO: Should this return an error or iterate? */
 218                 case VVOLEXISTS:
 219                 case VNOSERVICE:
 220                 case VONLINE:
 221                 case VDISKFULL:
 222                 case VOVERQUOTA:
 223                         fc->error = afs_abort_to_error(fc->ac.abort_code);
 224                         goto next_server;
 225 
 226                 case VOFFLINE:
 227                         if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) {
 228                                 afs_busy(vnode->volume, fc->ac.abort_code);
 229                                 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
 230                         }
 231                         if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
 232                                 fc->error = -EADV;
 233                                 goto failed;
 234                         }
 235                         if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
 236                                 fc->error = -ESTALE;
 237                                 goto failed;
 238                         }
 239                         goto busy;
 240 
 241                 case VSALVAGING:
 242                 case VRESTARTING:
 243                 case VBUSY:
 244                         /* Retry after going round all the servers unless we
 245                          * have a file lock we need to maintain.
 246                          */
 247                         if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
 248                                 fc->error = -EBUSY;
 249                                 goto failed;
 250                         }
 251                         if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
 252                                 afs_busy(vnode->volume, fc->ac.abort_code);
 253                                 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
 254                         }
 255                 busy:
 256                         if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
 257                                 if (!afs_sleep_and_retry(fc))
 258                                         goto failed;
 259 
 260                                  /* Retry with same server & address */
 261                                 _leave(" = t [vbusy]");
 262                                 return true;
 263                         }
 264 
 265                         fc->flags |= AFS_FS_CURSOR_VBUSY;
 266                         goto next_server;
 267 
 268                 case VMOVED:
 269                         /* The volume migrated to another server.  We consider
 270                          * consider all locks and callbacks broken and request
 271                          * an update from the VLDB.
 272                          *
 273                          * We also limit the number of VMOVED hops we will
 274                          * honour, just in case someone sets up a loop.
 275                          */
 276                         if (fc->flags & AFS_FS_CURSOR_VMOVED) {
 277                                 fc->error = -EREMOTEIO;
 278                                 goto failed;
 279                         }
 280                         fc->flags |= AFS_FS_CURSOR_VMOVED;
 281 
 282                         set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
 283                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
 284                         error = afs_check_volume_status(vnode->volume, fc);
 285                         if (error < 0)
 286                                 goto failed_set_error;
 287 
 288                         /* If the server list didn't change, then the VLDB is
 289                          * out of sync with the fileservers.  This is hopefully
 290                          * a temporary condition, however, so we don't want to
 291                          * permanently block access to the file.
 292                          *
 293                          * TODO: Try other fileservers if we can.
 294                          *
 295                          * TODO: Retry a few times with sleeps.
 296                          */
 297                         if (vnode->volume->servers == fc->server_list) {
 298                                 fc->error = -ENOMEDIUM;
 299                                 goto failed;
 300                         }
 301 
 302                         goto restart_from_beginning;
 303 
 304                 default:
 305                         clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
 306                         clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
 307                         fc->error = afs_abort_to_error(fc->ac.abort_code);
 308                         goto failed;
 309                 }
 310 
 311         case -ETIMEDOUT:
 312         case -ETIME:
 313                 if (fc->error != -EDESTADDRREQ)
 314                         goto iterate_address;
 315                 /* Fall through */
 316         case -ERFKILL:
 317         case -EADDRNOTAVAIL:
 318         case -ENETUNREACH:
 319         case -EHOSTUNREACH:
 320         case -EHOSTDOWN:
 321         case -ECONNREFUSED:
 322                 _debug("no conn");
 323                 fc->error = error;
 324                 goto iterate_address;
 325 
 326         case -ECONNRESET:
 327                 _debug("call reset");
 328                 fc->error = error;
 329                 goto failed;
 330         }
 331 
 332 restart_from_beginning:
 333         _debug("restart");
 334         afs_end_cursor(&fc->ac);
 335         afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
 336         fc->cbi = NULL;
 337         afs_put_serverlist(afs_v2net(vnode), fc->server_list);
 338         fc->server_list = NULL;
 339 start:
 340         _debug("start");
 341         /* See if we need to do an update of the volume record.  Note that the
 342          * volume may have moved or even have been deleted.
 343          */
 344         error = afs_check_volume_status(vnode->volume, fc);
 345         if (error < 0)
 346                 goto failed_set_error;
 347 
 348         if (!afs_start_fs_iteration(fc, vnode))
 349                 goto failed;
 350 
 351         _debug("__ VOL %llx __", vnode->volume->vid);
 352         error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
 353         if (error < 0)
 354                 goto failed_set_error;
 355 
 356 pick_server:
 357         _debug("pick [%lx]", fc->untried);
 358 
 359         error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
 360         if (error < 0)
 361                 goto failed_set_error;
 362 
 363         /* Pick the untried server with the lowest RTT.  If we have outstanding
 364          * callbacks, we stick with the server we're already using if we can.
 365          */
 366         if (fc->cbi) {
 367                 _debug("cbi %u", fc->index);
 368                 if (test_bit(fc->index, &fc->untried))
 369                         goto selected_server;
 370                 afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
 371                 fc->cbi = NULL;
 372                 _debug("nocbi");
 373         }
 374 
 375         fc->index = -1;
 376         rtt = U32_MAX;
 377         for (i = 0; i < fc->server_list->nr_servers; i++) {
 378                 struct afs_server *s = fc->server_list->servers[i].server;
 379 
 380                 if (!test_bit(i, &fc->untried) || !s->probe.responded)
 381                         continue;
 382                 if (s->probe.rtt < rtt) {
 383                         fc->index = i;
 384                         rtt = s->probe.rtt;
 385                 }
 386         }
 387 
 388         if (fc->index == -1)
 389                 goto no_more_servers;
 390 
 391 selected_server:
 392         _debug("use %d", fc->index);
 393         __clear_bit(fc->index, &fc->untried);
 394 
 395         /* We're starting on a different fileserver from the list.  We need to
 396          * check it, create a callback intercept, find its address list and
 397          * probe its capabilities before we use it.
 398          */
 399         ASSERTCMP(fc->ac.alist, ==, NULL);
 400         server = fc->server_list->servers[fc->index].server;
 401 
 402         if (!afs_check_server_record(fc, server))
 403                 goto failed;
 404 
 405         _debug("USING SERVER: %pU", &server->uuid);
 406 
 407         /* Make sure we've got a callback interest record for this server.  We
 408          * have to link it in before we send the request as we can be sent a
 409          * break request before we've finished decoding the reply and
 410          * installing the vnode.
 411          */
 412         error = afs_register_server_cb_interest(vnode, fc->server_list,
 413                                                 fc->index);
 414         if (error < 0)
 415                 goto failed_set_error;
 416 
 417         fc->cbi = afs_get_cb_interest(
 418                 rcu_dereference_protected(vnode->cb_interest,
 419                                           lockdep_is_held(&vnode->io_lock)));
 420 
 421         read_lock(&server->fs_lock);
 422         alist = rcu_dereference_protected(server->addresses,
 423                                           lockdep_is_held(&server->fs_lock));
 424         afs_get_addrlist(alist);
 425         read_unlock(&server->fs_lock);
 426 
 427         memset(&fc->ac, 0, sizeof(fc->ac));
 428 
 429         if (!fc->ac.alist)
 430                 fc->ac.alist = alist;
 431         else
 432                 afs_put_addrlist(alist);
 433 
 434         fc->ac.index = -1;
 435 
 436 iterate_address:
 437         ASSERT(fc->ac.alist);
 438         /* Iterate over the current server's address list to try and find an
 439          * address on which it will respond to us.
 440          */
 441         if (!afs_iterate_addresses(&fc->ac))
 442                 goto next_server;
 443 
 444         _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
 445 
 446         _leave(" = t");
 447         return true;
 448 
 449 next_server:
 450         _debug("next");
 451         afs_end_cursor(&fc->ac);
 452         goto pick_server;
 453 
 454 no_more_servers:
 455         /* That's all the servers poked to no good effect.  Try again if some
 456          * of them were busy.
 457          */
 458         if (fc->flags & AFS_FS_CURSOR_VBUSY)
 459                 goto restart_from_beginning;
 460 
 461         e.error = -EDESTADDRREQ;
 462         e.responded = false;
 463         for (i = 0; i < fc->server_list->nr_servers; i++) {
 464                 struct afs_server *s = fc->server_list->servers[i].server;
 465 
 466                 afs_prioritise_error(&e, READ_ONCE(s->probe.error),
 467                                      s->probe.abort_code);
 468         }
 469 
 470         error = e.error;
 471 
 472 failed_set_error:
 473         fc->error = error;
 474 failed:
 475         fc->flags |= AFS_FS_CURSOR_STOP;
 476         afs_end_cursor(&fc->ac);
 477         _leave(" = f [failed %d]", fc->error);
 478         return false;
 479 }
 480 
 481 /*
 482  * Select the same fileserver we used for a vnode before and only that
 483  * fileserver.  We use this when we have a lock on that file, which is backed
 484  * only by the fileserver we obtained it from.
 485  */
 486 bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
 487 {
 488         struct afs_vnode *vnode = fc->vnode;
 489         struct afs_cb_interest *cbi;
 490         struct afs_addr_list *alist;
 491         int error = fc->ac.error;
 492 
 493         _enter("");
 494 
 495         cbi = rcu_dereference_protected(vnode->cb_interest,
 496                                         lockdep_is_held(&vnode->io_lock));
 497 
 498         switch (error) {
 499         case SHRT_MAX:
 500                 if (!cbi) {
 501                         fc->error = -ESTALE;
 502                         fc->flags |= AFS_FS_CURSOR_STOP;
 503                         return false;
 504                 }
 505 
 506                 fc->cbi = afs_get_cb_interest(cbi);
 507 
 508                 read_lock(&cbi->server->fs_lock);
 509                 alist = rcu_dereference_protected(cbi->server->addresses,
 510                                                   lockdep_is_held(&cbi->server->fs_lock));
 511                 afs_get_addrlist(alist);
 512                 read_unlock(&cbi->server->fs_lock);
 513                 if (!alist) {
 514                         fc->error = -ESTALE;
 515                         fc->flags |= AFS_FS_CURSOR_STOP;
 516                         return false;
 517                 }
 518 
 519                 memset(&fc->ac, 0, sizeof(fc->ac));
 520                 fc->ac.alist = alist;
 521                 fc->ac.index = -1;
 522                 goto iterate_address;
 523 
 524         case 0:
 525         default:
 526                 /* Success or local failure.  Stop. */
 527                 fc->error = error;
 528                 fc->flags |= AFS_FS_CURSOR_STOP;
 529                 _leave(" = f [okay/local %d]", error);
 530                 return false;
 531 
 532         case -ECONNABORTED:
 533                 fc->error = afs_abort_to_error(fc->ac.abort_code);
 534                 fc->flags |= AFS_FS_CURSOR_STOP;
 535                 _leave(" = f [abort]");
 536                 return false;
 537 
 538         case -ERFKILL:
 539         case -EADDRNOTAVAIL:
 540         case -ENETUNREACH:
 541         case -EHOSTUNREACH:
 542         case -EHOSTDOWN:
 543         case -ECONNREFUSED:
 544         case -ETIMEDOUT:
 545         case -ETIME:
 546                 _debug("no conn");
 547                 fc->error = error;
 548                 goto iterate_address;
 549         }
 550 
 551 iterate_address:
 552         /* Iterate over the current server's address list to try and find an
 553          * address on which it will respond to us.
 554          */
 555         if (afs_iterate_addresses(&fc->ac)) {
 556                 _leave(" = t");
 557                 return true;
 558         }
 559 
 560         afs_end_cursor(&fc->ac);
 561         return false;
 562 }
 563 
 564 /*
 565  * Dump cursor state in the case of the error being EDESTADDRREQ.
 566  */
 567 static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
 568 {
 569         static int count;
 570         int i;
 571 
 572         if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
 573                 return;
 574         count++;
 575 
 576         rcu_read_lock();
 577 
 578         pr_notice("EDESTADDR occurred\n");
 579         pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
 580                   fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
 581         pr_notice("FC: ut=%lx ix=%d ni=%u\n",
 582                   fc->untried, fc->index, fc->nr_iterations);
 583 
 584         if (fc->server_list) {
 585                 const struct afs_server_list *sl = fc->server_list;
 586                 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
 587                           sl->nr_servers, sl->preferred, sl->vnovol_mask);
 588                 for (i = 0; i < sl->nr_servers; i++) {
 589                         const struct afs_server *s = sl->servers[i].server;
 590                         pr_notice("FC: server fl=%lx av=%u %pU\n",
 591                                   s->flags, s->addr_version, &s->uuid);
 592                         if (s->addresses) {
 593                                 const struct afs_addr_list *a =
 594                                         rcu_dereference(s->addresses);
 595                                 pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
 596                                           a->version,
 597                                           a->nr_ipv4, a->nr_addrs, a->max_addrs,
 598                                           a->preferred);
 599                                 pr_notice("FC:  - pr=%lx R=%lx F=%lx\n",
 600                                           a->probed, a->responded, a->failed);
 601                                 if (a == fc->ac.alist)
 602                                         pr_notice("FC:  - current\n");
 603                         }
 604                 }
 605         }
 606 
 607         pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
 608                   fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
 609                   fc->ac.responded, fc->ac.nr_iterations);
 610         rcu_read_unlock();
 611 }
 612 
 613 /*
 614  * Tidy up a filesystem cursor and unlock the vnode.
 615  */
 616 int afs_end_vnode_operation(struct afs_fs_cursor *fc)
 617 {
 618         struct afs_net *net = afs_v2net(fc->vnode);
 619 
 620         if (fc->error == -EDESTADDRREQ ||
 621             fc->error == -EADDRNOTAVAIL ||
 622             fc->error == -ENETUNREACH ||
 623             fc->error == -EHOSTUNREACH)
 624                 afs_dump_edestaddrreq(fc);
 625 
 626         mutex_unlock(&fc->vnode->io_lock);
 627 
 628         afs_end_cursor(&fc->ac);
 629         afs_put_cb_interest(net, fc->cbi);
 630         afs_put_serverlist(net, fc->server_list);
 631 
 632         if (fc->error == -ECONNABORTED)
 633                 fc->error = afs_abort_to_error(fc->ac.abort_code);
 634 
 635         return fc->error;
 636 }

/* [<][>][^][v][top][bottom][index][help] */