fs/nfs/direct.c


DEFINITIONS

This source file includes the following definitions:
  1. get_dreq
  2. put_dreq
  3. nfs_direct_handle_truncated
  4. nfs_direct_count_bytes
  5. nfs_direct_select_verf
  6. nfs_direct_set_hdr_verf
  7. nfs_direct_cmp_verf
  8. nfs_direct_set_or_cmp_hdr_verf
  9. nfs_direct_cmp_commit_data_verf
  10. nfs_direct_IO
  11. nfs_direct_release_pages
  12. nfs_init_cinfo_from_dreq
  13. nfs_direct_req_alloc
  14. nfs_direct_req_free
  15. nfs_direct_req_release
  16. nfs_dreq_bytes_left
  17. nfs_direct_wait
  18. nfs_direct_complete
  19. nfs_direct_read_completion
  20. nfs_read_sync_pgio_error
  21. nfs_direct_pgio_init
  22. nfs_direct_read_schedule_iovec
  23. nfs_file_direct_read
  24. nfs_direct_write_scan_commit_list
  25. nfs_direct_write_reschedule
  26. nfs_direct_commit_complete
  27. nfs_direct_resched_write
  28. nfs_direct_commit_schedule
  29. nfs_direct_write_schedule_work
  30. nfs_direct_write_complete
  31. nfs_direct_write_completion
  32. nfs_write_sync_pgio_error
  33. nfs_direct_write_reschedule_io
  34. nfs_direct_write_schedule_iovec
  35. nfs_file_direct_write
  36. nfs_init_directcache
  37. nfs_destroy_directcache

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * linux/fs/nfs/direct.c
   4  *
   5  * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
   6  *
   7  * High-performance uncached I/O for the Linux NFS client
   8  *
   9  * There are important applications whose performance or correctness
  10  * depends on uncached access to file data.  Database clusters
  11  * (multiple copies of the same instance running on separate hosts)
  12  * implement their own cache coherency protocol that subsumes file
  13  * system cache protocols.  Applications that process datasets
  14  * considerably larger than the client's memory do not always benefit
  15  * from a local cache.  A streaming video server, for instance, has no
  16  * need to cache the contents of a file.
  17  *
  18  * When an application requests uncached I/O, all read and write requests
  19  * are made directly to the server; data stored or fetched via these
  20  * requests is not cached in the Linux page cache.  The client does not
  21  * correct unaligned requests from applications.  All requested bytes are
  22  * held on permanent storage before a direct write system call returns to
  23  * an application.
  24  *
  25  * Solaris implements an uncached I/O facility called directio() that
  26  * is used for backups and sequential I/O to very large files.  Solaris
  27  * also supports uncaching whole NFS partitions with "-o forcedirectio,"
  28  * an undocumented mount option.
  29  *
  30  * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
  31  * help from Andrew Morton.
  32  *
  33  * 18 Dec 2001  Initial implementation for 2.4  --cel
  34  * 08 Jul 2002  Version for 2.4.19, with bug fixes --trondmy
  35  * 08 Jun 2003  Port to 2.5 APIs  --cel
  36  * 31 Mar 2004  Handle direct I/O without VFS support  --cel
  37  * 15 Sep 2004  Parallel async reads  --cel
  38  * 04 May 2005  support O_DIRECT with aio  --cel
  39  *
  40  */
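
/*
 * Illustrative userspace sketch (editorial addition, not part of the
 * original file): a typical application path into the code below.
 * The 4096-byte alignment is an assumption; O_DIRECT alignment rules
 * vary by kernel version and file system.
 *
 *	int fd = open("/mnt/nfs/data", O_RDWR | O_DIRECT);
 *	void *buf;
 *
 *	posix_memalign(&buf, 4096, 4096);
 *	pread(fd, buf, 4096, 0);	// serviced by nfs_file_direct_read()
 *	pwrite(fd, buf, 4096, 0);	// serviced by nfs_file_direct_write()
 */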
  41 
  42 #include <linux/errno.h>
  43 #include <linux/sched.h>
  44 #include <linux/kernel.h>
  45 #include <linux/file.h>
  46 #include <linux/pagemap.h>
  47 #include <linux/kref.h>
  48 #include <linux/slab.h>
  49 #include <linux/task_io_accounting_ops.h>
  50 #include <linux/module.h>
  51 
  52 #include <linux/nfs_fs.h>
  53 #include <linux/nfs_page.h>
  54 #include <linux/sunrpc/clnt.h>
  55 
  56 #include <linux/uaccess.h>
  57 #include <linux/atomic.h>
  58 
  59 #include "internal.h"
  60 #include "iostat.h"
  61 #include "pnfs.h"
  62 
  63 #define NFSDBG_FACILITY         NFSDBG_VFS
  64 
  65 static struct kmem_cache *nfs_direct_cachep;
  66 
  67 struct nfs_direct_req {
  68         struct kref             kref;           /* release manager */
  69 
  70         /* I/O parameters */
  71         struct nfs_open_context *ctx;           /* file open context info */
  72         struct nfs_lock_context *l_ctx;         /* Lock context info */
  73         struct kiocb *          iocb;           /* controlling i/o request */
  74         struct inode *          inode;          /* target file of i/o */
  75 
  76         /* completion state */
  77         atomic_t                io_count;       /* i/os we're waiting for */
  78         spinlock_t              lock;           /* protect completion state */
  79 
  80         loff_t                  io_start;       /* Start offset for I/O */
  81         ssize_t                 count,          /* bytes actually processed */
  82                                 max_count,      /* max expected count */
  83                                 bytes_left,     /* bytes left to be sent */
  84                                 error;          /* any reported error */
  85         struct completion       completion;     /* wait for i/o completion */
  86 
  87         /* commit state */
  88         struct nfs_mds_commit_info mds_cinfo;   /* Storage for cinfo */
  89         struct pnfs_ds_commit_info ds_cinfo;    /* Storage for cinfo */
  90         struct work_struct      work;
  91         int                     flags;
  92         /* for write */
  93 #define NFS_ODIRECT_DO_COMMIT           (1)     /* an unstable reply was received */
  94 #define NFS_ODIRECT_RESCHED_WRITES      (2)     /* write verification failed */
  95         /* for read */
  96 #define NFS_ODIRECT_SHOULD_DIRTY        (3)     /* dirty user-space page after read */
  97         struct nfs_writeverf    verf;           /* unstable write verifier */
  98 };
  99 
 100 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
 101 static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
 102 static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
 103 static void nfs_direct_write_schedule_work(struct work_struct *work);
 104 
 105 static inline void get_dreq(struct nfs_direct_req *dreq)
 106 {
 107         atomic_inc(&dreq->io_count);
 108 }
 109 
 110 static inline int put_dreq(struct nfs_direct_req *dreq)
 111 {
 112         return atomic_dec_and_test(&dreq->io_count);
 113 }
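
/*
 * Reference-counting sketch (illustrative): each scheduler holds one
 * reference for its setup phase, each dispatched pgio header takes
 * another via nfs_direct_pgio_init(), and every completion drops one.
 * Whoever drops the last reference finishes the request:
 *
 *	get_dreq(dreq);			// setup-phase reference
 *	... dispatch I/O; each header calls get_dreq() ...
 *	if (put_dreq(dreq))		// last reference dropped?
 *		nfs_direct_complete(dreq);
 */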
 114 
 115 static void
 116 nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
 117                             const struct nfs_pgio_header *hdr,
 118                             ssize_t dreq_len)
 119 {
 120         if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
 121               test_bit(NFS_IOHDR_EOF, &hdr->flags)))
 122                 return;
 123         if (dreq->max_count >= dreq_len) {
 124                 dreq->max_count = dreq_len;
 125                 if (dreq->count > dreq_len)
 126                         dreq->count = dreq_len;
 127 
 128                 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
 129                         dreq->error = hdr->error;
 130                 else /* Clear outstanding error if this is EOF */
 131                         dreq->error = 0;
 132         }
 133 }
 134 
 135 static void
 136 nfs_direct_count_bytes(struct nfs_direct_req *dreq,
 137                        const struct nfs_pgio_header *hdr)
 138 {
 139         loff_t hdr_end = hdr->io_start + hdr->good_bytes;
 140         ssize_t dreq_len = 0;
 141 
 142         if (hdr_end > dreq->io_start)
 143                 dreq_len = hdr_end - dreq->io_start;
 144 
 145         nfs_direct_handle_truncated(dreq, hdr, dreq_len);
 146 
 147         if (dreq_len > dreq->max_count)
 148                 dreq_len = dreq->max_count;
 149 
 150         if (dreq->count < dreq_len)
 151                 dreq->count = dreq_len;
 152 }
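
/*
 * Worked example (illustrative): for a 16384-byte read with
 * io_start == 0, suppose an 8192-byte RPC at offset 0 succeeds in
 * full and a second RPC at offset 8192 hits EOF after 4096 bytes.
 * The EOF header has io_start == 8192 and good_bytes == 4096, so
 * hdr_end == 12288 and dreq_len == 12288; max_count is trimmed from
 * 16384 to 12288, dreq->count becomes 12288, and the application
 * sees a short read of 12288 bytes with no error.
 */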
 153 
 154 /*
 155  * nfs_direct_select_verf - select the right verifier
 156  * @dreq - direct request possibly spanning multiple servers
 157  * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
 158  * @commit_idx - commit bucket index for the DS
 159  *
 160  * returns the correct verifier to use given the role of the server
 161  */
 162 static struct nfs_writeverf *
 163 nfs_direct_select_verf(struct nfs_direct_req *dreq,
 164                        struct nfs_client *ds_clp,
 165                        int commit_idx)
 166 {
 167         struct nfs_writeverf *verfp = &dreq->verf;
 168 
 169 #ifdef CONFIG_NFS_V4_1
 170         /*
  171          * If pNFS is in use, use the DS verf, unless commit_through_mds is
  172          * set for the layout segment, in which case nbuckets is zero.
 173          */
 174         if (ds_clp && dreq->ds_cinfo.nbuckets > 0) {
 175                 if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
 176                         verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
 177                 else
 178                         WARN_ON_ONCE(1);
 179         }
 180 #endif
 181         return verfp;
 182 }
 183 
 184 
 185 /*
 186  * nfs_direct_set_hdr_verf - set the write/commit verifier
 187  * @dreq - direct request possibly spanning multiple servers
 188  * @hdr - pageio header to validate against previously seen verfs
 189  *
 190  * Set the server's (MDS or DS) "seen" verifier
 191  */
 192 static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
 193                                     struct nfs_pgio_header *hdr)
 194 {
 195         struct nfs_writeverf *verfp;
 196 
 197         verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
 198         WARN_ON_ONCE(verfp->committed >= 0);
 199         memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 200         WARN_ON_ONCE(verfp->committed < 0);
 201 }
 202 
 203 static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
 204                 const struct nfs_writeverf *v2)
 205 {
 206         return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
 207 }
 208 
 209 /*
 210  * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
 211  * @dreq - direct request possibly spanning multiple servers
 212  * @hdr - pageio header to validate against previously seen verf
 213  *
 214  * set the server's "seen" verf if not initialized.
 215  * returns result of comparison between @hdr->verf and the "seen"
 216  * verf of the server used by @hdr (DS or MDS)
 217  */
 218 static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 219                                           struct nfs_pgio_header *hdr)
 220 {
 221         struct nfs_writeverf *verfp;
 222 
 223         verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
 224         if (verfp->committed < 0) {
 225                 nfs_direct_set_hdr_verf(dreq, hdr);
 226                 return 0;
 227         }
 228         return nfs_direct_cmp_verf(verfp, &hdr->verf);
 229 }
 230 
 231 /*
 232  * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
 233  * @dreq - direct request possibly spanning multiple servers
 234  * @data - commit data to validate against previously seen verf
 235  *
 236  * returns result of comparison between @data->verf and the verf of
 237  * the server used by @data (DS or MDS)
 238  */
 239 static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
 240                                            struct nfs_commit_data *data)
 241 {
 242         struct nfs_writeverf *verfp;
 243 
 244         verfp = nfs_direct_select_verf(dreq, data->ds_clp,
 245                                          data->ds_commit_index);
 246 
 247         /* verifier not set so always fail */
 248         if (verfp->committed < 0 || data->res.verf->committed <= NFS_UNSTABLE)
 249                 return 1;
 250 
 251         return nfs_direct_cmp_verf(verfp, data->res.verf);
 252 }
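
/*
 * Verifier-checking flow (illustrative): the first unstable WRITE
 * reply records the server's verifier via nfs_direct_set_hdr_verf();
 * later replies are compared against it, as in the write completion
 * path below:
 *
 *	if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
 *		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 *
 * A mismatch there, or in nfs_direct_cmp_commit_data_verf() after a
 * COMMIT, suggests the server rebooted, so the writes are resent.
 */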
 253 
 254 /**
 255  * nfs_direct_IO - NFS address space operation for direct I/O
 256  * @iocb: target I/O control block
 257  * @iter: I/O buffer
 258  *
 259  * The presence of this routine in the address space ops vector means
 260  * the NFS client supports direct I/O. However, for most direct IO, we
 261  * shunt off direct read and write requests before the VFS gets them,
 262  * so this method is only ever called for swap.
 263  */
 264 ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 265 {
 266         struct inode *inode = iocb->ki_filp->f_mapping->host;
 267 
 268         /* we only support swap file calling nfs_direct_IO */
 269         if (!IS_SWAPFILE(inode))
 270                 return 0;
 271 
 272         VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
 273 
 274         if (iov_iter_rw(iter) == READ)
 275                 return nfs_file_direct_read(iocb, iter);
 276         return nfs_file_direct_write(iocb, iter);
 277 }
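
/*
 * Illustrative: with a swapfile on NFS, swap-in and swap-out reach
 * nfs_direct_IO() through the VFS direct-IO path, always as a single
 * PAGE_SIZE request (enforced by the VM_BUG_ON above), and are simply
 * forwarded to the read/write helpers below.
 */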
 278 
 279 static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 280 {
 281         unsigned int i;
 282         for (i = 0; i < npages; i++)
 283                 put_page(pages[i]);
 284 }
 285 
 286 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 287                               struct nfs_direct_req *dreq)
 288 {
 289         cinfo->inode = dreq->inode;
 290         cinfo->mds = &dreq->mds_cinfo;
 291         cinfo->ds = &dreq->ds_cinfo;
 292         cinfo->dreq = dreq;
 293         cinfo->completion_ops = &nfs_direct_commit_completion_ops;
 294 }
 295 
 296 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 297 {
 298         struct nfs_direct_req *dreq;
 299 
 300         dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
 301         if (!dreq)
 302                 return NULL;
 303 
 304         kref_init(&dreq->kref);
 305         kref_get(&dreq->kref);
 306         init_completion(&dreq->completion);
 307         INIT_LIST_HEAD(&dreq->mds_cinfo.list);
 308         dreq->verf.committed = NFS_INVALID_STABLE_HOW;  /* not set yet */
 309         INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
 310         spin_lock_init(&dreq->lock);
 311 
 312         return dreq;
 313 }
 314 
 315 static void nfs_direct_req_free(struct kref *kref)
 316 {
 317         struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
 318 
 319         nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
 320         if (dreq->l_ctx != NULL)
 321                 nfs_put_lock_context(dreq->l_ctx);
 322         if (dreq->ctx != NULL)
 323                 put_nfs_open_context(dreq->ctx);
 324         kmem_cache_free(nfs_direct_cachep, dreq);
 325 }
 326 
 327 static void nfs_direct_req_release(struct nfs_direct_req *dreq)
 328 {
 329         kref_put(&dreq->kref, nfs_direct_req_free);
 330 }
 331 
 332 ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
 333 {
 334         return dreq->bytes_left;
 335 }
 336 EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
 337 
 338 /*
 339  * Collects and returns the final error value/byte-count.
 340  */
 341 static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
 342 {
 343         ssize_t result = -EIOCBQUEUED;
 344 
 345         /* Async requests don't wait here */
 346         if (dreq->iocb)
 347                 goto out;
 348 
 349         result = wait_for_completion_killable(&dreq->completion);
 350 
 351         if (!result) {
 352                 result = dreq->count;
 353                 WARN_ON_ONCE(dreq->count < 0);
 354         }
 355         if (!result)
 356                 result = dreq->error;
 357 
 358 out:
 359         return (ssize_t) result;
 360 }
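
/*
 * Illustrative: for a synchronous kiocb, dreq->iocb is NULL and the
 * caller blocks above until the final put_dreq() fires
 * dreq->completion; for async I/O, -EIOCBQUEUED is returned at once
 * and the result is delivered later through dreq->iocb->ki_complete()
 * in nfs_direct_complete().
 */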
 361 
 362 /*
 363  * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 364  * the iocb is still valid here if this is a synchronous request.
 365  */
 366 static void nfs_direct_complete(struct nfs_direct_req *dreq)
 367 {
 368         struct inode *inode = dreq->inode;
 369 
 370         inode_dio_end(inode);
 371 
 372         if (dreq->iocb) {
 373                 long res = (long) dreq->error;
 374                 if (dreq->count != 0) {
 375                         res = (long) dreq->count;
 376                         WARN_ON_ONCE(dreq->count < 0);
 377                 }
 378                 dreq->iocb->ki_complete(dreq->iocb, res, 0);
 379         }
 380 
 381         complete(&dreq->completion);
 382 
 383         nfs_direct_req_release(dreq);
 384 }
 385 
 386 static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 387 {
 388         unsigned long bytes = 0;
 389         struct nfs_direct_req *dreq = hdr->dreq;
 390 
 391         spin_lock(&dreq->lock);
 392         if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 393                 spin_unlock(&dreq->lock);
 394                 goto out_put;
 395         }
 396 
 397         nfs_direct_count_bytes(dreq, hdr);
 398         spin_unlock(&dreq->lock);
 399 
 400         while (!list_empty(&hdr->pages)) {
 401                 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 402                 struct page *page = req->wb_page;
 403 
 404                 if (!PageCompound(page) && bytes < hdr->good_bytes &&
 405                     (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
 406                         set_page_dirty(page);
 407                 bytes += req->wb_bytes;
 408                 nfs_list_remove_request(req);
 409                 nfs_release_request(req);
 410         }
 411 out_put:
 412         if (put_dreq(dreq))
 413                 nfs_direct_complete(dreq);
 414         hdr->release(hdr);
 415 }
 416 
 417 static void nfs_read_sync_pgio_error(struct list_head *head, int error)
 418 {
 419         struct nfs_page *req;
 420 
 421         while (!list_empty(head)) {
 422                 req = nfs_list_entry(head->next);
 423                 nfs_list_remove_request(req);
 424                 nfs_release_request(req);
 425         }
 426 }
 427 
 428 static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
 429 {
 430         get_dreq(hdr->dreq);
 431 }
 432 
 433 static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
 434         .error_cleanup = nfs_read_sync_pgio_error,
 435         .init_hdr = nfs_direct_pgio_init,
 436         .completion = nfs_direct_read_completion,
 437 };
 438 
  439 /*
  440  * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
  441  * operation.  If iov_iter_get_pages_alloc() or nfs_create_request()
  442  * fails, bail and stop sending more reads.  Read length accounting is
  443  * handled by nfs_direct_count_bytes().  Otherwise, if no requests
  444  * have been sent, just return an error.
  445  */
 446 
 447 static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 448                                               struct iov_iter *iter,
 449                                               loff_t pos)
 450 {
 451         struct nfs_pageio_descriptor desc;
 452         struct inode *inode = dreq->inode;
 453         ssize_t result = -EINVAL;
 454         size_t requested_bytes = 0;
 455         size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
 456 
 457         nfs_pageio_init_read(&desc, dreq->inode, false,
 458                              &nfs_direct_read_completion_ops);
 459         get_dreq(dreq);
 460         desc.pg_dreq = dreq;
 461         inode_dio_begin(inode);
 462 
 463         while (iov_iter_count(iter)) {
 464                 struct page **pagevec;
 465                 size_t bytes;
 466                 size_t pgbase;
 467                 unsigned npages, i;
 468 
 469                 result = iov_iter_get_pages_alloc(iter, &pagevec, 
 470                                                   rsize, &pgbase);
 471                 if (result < 0)
 472                         break;
  473 
 474                 bytes = result;
 475                 iov_iter_advance(iter, bytes);
 476                 npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
 477                 for (i = 0; i < npages; i++) {
 478                         struct nfs_page *req;
 479                         unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 480                         /* XXX do we need to do the eof zeroing found in async_filler? */
 481                         req = nfs_create_request(dreq->ctx, pagevec[i],
 482                                                  pgbase, req_len);
 483                         if (IS_ERR(req)) {
 484                                 result = PTR_ERR(req);
 485                                 break;
 486                         }
 487                         req->wb_index = pos >> PAGE_SHIFT;
 488                         req->wb_offset = pos & ~PAGE_MASK;
 489                         if (!nfs_pageio_add_request(&desc, req)) {
 490                                 result = desc.pg_error;
 491                                 nfs_release_request(req);
 492                                 break;
 493                         }
 494                         pgbase = 0;
 495                         bytes -= req_len;
 496                         requested_bytes += req_len;
 497                         pos += req_len;
 498                         dreq->bytes_left -= req_len;
 499                 }
 500                 nfs_direct_release_pages(pagevec, npages);
 501                 kvfree(pagevec);
 502                 if (result < 0)
 503                         break;
 504         }
 505 
 506         nfs_pageio_complete(&desc);
 507 
 508         /*
 509          * If no bytes were started, return the error, and let the
 510          * generic layer handle the completion.
 511          */
 512         if (requested_bytes == 0) {
 513                 inode_dio_end(inode);
 514                 nfs_direct_req_release(dreq);
 515                 return result < 0 ? result : -EIO;
 516         }
 517 
 518         if (put_dreq(dreq))
 519                 nfs_direct_complete(dreq);
 520         return requested_bytes;
 521 }
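
/*
 * Worked example (illustrative): with rsize == 262144, a 1 MiB read
 * is dispatched as four chunks.  If iov_iter_get_pages_alloc()
 * returns result == 262144 with pgbase == 512 on a 4 KiB-page
 * machine, then npages == (262144 + 512 + 4095) / 4096 == 65: the
 * misaligned start costs one extra page, and the first nfs_page
 * covers only PAGE_SIZE - pgbase == 3584 bytes.
 */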
 522 
 523 /**
 524  * nfs_file_direct_read - file direct read operation for NFS files
 525  * @iocb: target I/O control block
 526  * @iter: vector of user buffers into which to read data
 527  *
 528  * We use this function for direct reads instead of calling
  529  * generic_file_aio_read() in order to avoid its check that the
  530  * request starts before the end of the file.  For that check
 531  * to work, we must generate a GETATTR before each direct read, and
 532  * even then there is a window between the GETATTR and the subsequent
 533  * READ where the file size could change.  Our preference is simply
 534  * to do all reads the application wants, and the server will take
 535  * care of managing the end of file boundary.
 536  *
 537  * This function also eliminates unnecessarily updating the file's
 538  * atime locally, as the NFS server sets the file's atime, and this
 539  * client must read the updated atime from the server back into its
 540  * cache.
 541  */
 542 ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 543 {
 544         struct file *file = iocb->ki_filp;
 545         struct address_space *mapping = file->f_mapping;
 546         struct inode *inode = mapping->host;
 547         struct nfs_direct_req *dreq;
 548         struct nfs_lock_context *l_ctx;
 549         ssize_t result = -EINVAL, requested;
 550         size_t count = iov_iter_count(iter);
 551         nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
 552 
 553         dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
 554                 file, count, (long long) iocb->ki_pos);
 555 
 556         result = 0;
 557         if (!count)
 558                 goto out;
 559 
 560         task_io_account_read(count);
 561 
 562         result = -ENOMEM;
 563         dreq = nfs_direct_req_alloc();
 564         if (dreq == NULL)
 565                 goto out;
 566 
 567         dreq->inode = inode;
 568         dreq->bytes_left = dreq->max_count = count;
 569         dreq->io_start = iocb->ki_pos;
 570         dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 571         l_ctx = nfs_get_lock_context(dreq->ctx);
 572         if (IS_ERR(l_ctx)) {
 573                 result = PTR_ERR(l_ctx);
 574                 nfs_direct_req_release(dreq);
 575                 goto out_release;
 576         }
 577         dreq->l_ctx = l_ctx;
 578         if (!is_sync_kiocb(iocb))
 579                 dreq->iocb = iocb;
 580 
 581         if (iter_is_iovec(iter))
 582                 dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
 583 
 584         nfs_start_io_direct(inode);
 585 
 586         NFS_I(inode)->read_io += count;
 587         requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
 588 
 589         nfs_end_io_direct(inode);
 590 
 591         if (requested > 0) {
 592                 result = nfs_direct_wait(dreq);
 593                 if (result > 0) {
 594                         requested -= result;
 595                         iocb->ki_pos += result;
 596                 }
 597                 iov_iter_revert(iter, requested);
 598         } else {
 599                 result = requested;
 600         }
 601 
 602 out_release:
 603         nfs_direct_req_release(dreq);
 604 out:
 605         return result;
 606 }
 607 
 608 static void
 609 nfs_direct_write_scan_commit_list(struct inode *inode,
 610                                   struct list_head *list,
 611                                   struct nfs_commit_info *cinfo)
 612 {
 613         mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
 614 #ifdef CONFIG_NFS_V4_1
 615         if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
 616                 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 617 #endif
 618         nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
 619         mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 620 }
 621 
 622 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 623 {
 624         struct nfs_pageio_descriptor desc;
 625         struct nfs_page *req, *tmp;
 626         LIST_HEAD(reqs);
 627         struct nfs_commit_info cinfo;
 628         LIST_HEAD(failed);
 629 
 630         nfs_init_cinfo_from_dreq(&cinfo, dreq);
 631         nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
 632 
 633         dreq->count = 0;
 634         dreq->max_count = 0;
 635         list_for_each_entry(req, &reqs, wb_list)
 636                 dreq->max_count += req->wb_bytes;
 637         dreq->verf.committed = NFS_INVALID_STABLE_HOW;
 638         nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
 639         get_dreq(dreq);
 640 
 641         nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
 642                               &nfs_direct_write_completion_ops);
 643         desc.pg_dreq = dreq;
 644 
 645         list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
 646                 /* Bump the transmission count */
 647                 req->wb_nio++;
 648                 if (!nfs_pageio_add_request(&desc, req)) {
 649                         nfs_list_move_request(req, &failed);
 650                         spin_lock(&cinfo.inode->i_lock);
 651                         dreq->flags = 0;
 652                         if (desc.pg_error < 0)
 653                                 dreq->error = desc.pg_error;
 654                         else
 655                                 dreq->error = -EIO;
 656                         spin_unlock(&cinfo.inode->i_lock);
 657                 }
 658                 nfs_release_request(req);
 659         }
 660         nfs_pageio_complete(&desc);
 661 
 662         while (!list_empty(&failed)) {
 663                 req = nfs_list_entry(failed.next);
 664                 nfs_list_remove_request(req);
 665                 nfs_unlock_and_release_request(req);
 666         }
 667 
 668         if (put_dreq(dreq))
 669                 nfs_direct_write_complete(dreq);
 670 }
 671 
 672 static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 673 {
 674         struct nfs_direct_req *dreq = data->dreq;
 675         struct nfs_commit_info cinfo;
 676         struct nfs_page *req;
 677         int status = data->task.tk_status;
 678 
 679         nfs_init_cinfo_from_dreq(&cinfo, dreq);
 680         if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data))
 681                 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 682 
 683         while (!list_empty(&data->pages)) {
 684                 req = nfs_list_entry(data->pages.next);
 685                 nfs_list_remove_request(req);
 686                 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
 687                         /*
 688                          * Despite the reboot, the write was successful,
 689                          * so reset wb_nio.
 690                          */
 691                         req->wb_nio = 0;
 692                         /* Note the rewrite will go through mds */
 693                         nfs_mark_request_commit(req, NULL, &cinfo, 0);
 694                 } else
 695                         nfs_release_request(req);
 696                 nfs_unlock_and_release_request(req);
 697         }
 698 
 699         if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
 700                 nfs_direct_write_complete(dreq);
 701 }
 702 
 703 static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
 704                 struct nfs_page *req)
 705 {
 706         struct nfs_direct_req *dreq = cinfo->dreq;
 707 
 708         spin_lock(&dreq->lock);
 709         dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 710         spin_unlock(&dreq->lock);
 711         nfs_mark_request_commit(req, NULL, cinfo, 0);
 712 }
 713 
 714 static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
 715         .completion = nfs_direct_commit_complete,
 716         .resched_write = nfs_direct_resched_write,
 717 };
 718 
 719 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 720 {
 721         int res;
 722         struct nfs_commit_info cinfo;
 723         LIST_HEAD(mds_list);
 724 
 725         nfs_init_cinfo_from_dreq(&cinfo, dreq);
 726         nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
 727         res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
 728         if (res < 0) /* res == -ENOMEM */
 729                 nfs_direct_write_reschedule(dreq);
 730 }
 731 
 732 static void nfs_direct_write_schedule_work(struct work_struct *work)
 733 {
 734         struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
 735         int flags = dreq->flags;
 736 
 737         dreq->flags = 0;
 738         switch (flags) {
 739                 case NFS_ODIRECT_DO_COMMIT:
 740                         nfs_direct_commit_schedule(dreq);
 741                         break;
 742                 case NFS_ODIRECT_RESCHED_WRITES:
 743                         nfs_direct_write_reschedule(dreq);
 744                         break;
 745                 default:
 746                         nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
 747                         nfs_direct_complete(dreq);
 748         }
 749 }
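
/*
 * Illustrative state walk: a write that got an unstable reply sets
 * NFS_ODIRECT_DO_COMMIT, so this work item sends a COMMIT; if the
 * commit verifier then mismatches, nfs_direct_commit_complete() sets
 * NFS_ODIRECT_RESCHED_WRITES and the next pass resends the data
 * through the MDS.  Once a pass finishes with flags == 0, the page
 * cache is zapped and the request completes.
 */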
 750 
 751 static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
 752 {
 753         queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
 754 }
 755 
 756 static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 757 {
 758         struct nfs_direct_req *dreq = hdr->dreq;
 759         struct nfs_commit_info cinfo;
 760         bool request_commit = false;
 761         struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 762 
 763         nfs_init_cinfo_from_dreq(&cinfo, dreq);
 764 
 765         spin_lock(&dreq->lock);
 766         if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 767                 spin_unlock(&dreq->lock);
 768                 goto out_put;
 769         }
 770 
 771         nfs_direct_count_bytes(dreq, hdr);
 772         if (hdr->good_bytes != 0) {
 773                 if (nfs_write_need_commit(hdr)) {
 774                         if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
 775                                 request_commit = true;
 776                         else if (dreq->flags == 0) {
 777                                 nfs_direct_set_hdr_verf(dreq, hdr);
 778                                 request_commit = true;
 779                                 dreq->flags = NFS_ODIRECT_DO_COMMIT;
 780                         } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
 781                                 request_commit = true;
 782                                 if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
 783                                         dreq->flags =
 784                                                 NFS_ODIRECT_RESCHED_WRITES;
 785                         }
 786                 }
 787         }
 788         spin_unlock(&dreq->lock);
 789 
 790         while (!list_empty(&hdr->pages)) {
 791 
 792                 req = nfs_list_entry(hdr->pages.next);
 793                 nfs_list_remove_request(req);
 794                 if (request_commit) {
 795                         kref_get(&req->wb_kref);
 796                         nfs_mark_request_commit(req, hdr->lseg, &cinfo,
 797                                 hdr->ds_commit_idx);
 798                 }
 799                 nfs_unlock_and_release_request(req);
 800         }
 801 
 802 out_put:
 803         if (put_dreq(dreq))
 804                 nfs_direct_write_complete(dreq);
 805         hdr->release(hdr);
 806 }
 807 
 808 static void nfs_write_sync_pgio_error(struct list_head *head, int error)
 809 {
 810         struct nfs_page *req;
 811 
 812         while (!list_empty(head)) {
 813                 req = nfs_list_entry(head->next);
 814                 nfs_list_remove_request(req);
 815                 nfs_unlock_and_release_request(req);
 816         }
 817 }
 818 
 819 static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
 820 {
 821         struct nfs_direct_req *dreq = hdr->dreq;
 822 
 823         spin_lock(&dreq->lock);
 824         if (dreq->error == 0) {
 825                 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 826                 /* fake unstable write to let common nfs resend pages */
 827                 hdr->verf.committed = NFS_UNSTABLE;
 828                 hdr->good_bytes = hdr->args.count;
 829         }
 830         spin_unlock(&dreq->lock);
 831 }
 832 
 833 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
 834         .error_cleanup = nfs_write_sync_pgio_error,
 835         .init_hdr = nfs_direct_pgio_init,
 836         .completion = nfs_direct_write_completion,
 837         .reschedule_io = nfs_direct_write_reschedule_io,
 838 };
 839 
 840 
  841 /*
  842  * NB: Return the value of the first error return code.  Subsequent
  843  *     errors after the first one are ignored.
  844  *
  845  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
  846  * operation.  If iov_iter_get_pages_alloc() or nfs_create_request()
  847  * fails, bail and stop sending more writes.  Write length accounting
  848  * is handled by nfs_direct_count_bytes() via the write completion
  849  * path.  Otherwise, if no requests have been sent, just return an
  850  * error.
  851  */
 852 static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 853                                                struct iov_iter *iter,
 854                                                loff_t pos)
 855 {
 856         struct nfs_pageio_descriptor desc;
 857         struct inode *inode = dreq->inode;
 858         ssize_t result = 0;
 859         size_t requested_bytes = 0;
 860         size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
 861 
 862         nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
 863                               &nfs_direct_write_completion_ops);
 864         desc.pg_dreq = dreq;
 865         get_dreq(dreq);
 866         inode_dio_begin(inode);
 867 
 868         NFS_I(inode)->write_io += iov_iter_count(iter);
 869         while (iov_iter_count(iter)) {
 870                 struct page **pagevec;
 871                 size_t bytes;
 872                 size_t pgbase;
 873                 unsigned npages, i;
 874 
 875                 result = iov_iter_get_pages_alloc(iter, &pagevec, 
 876                                                   wsize, &pgbase);
 877                 if (result < 0)
 878                         break;
 879 
 880                 bytes = result;
 881                 iov_iter_advance(iter, bytes);
 882                 npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
 883                 for (i = 0; i < npages; i++) {
 884                         struct nfs_page *req;
 885                         unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 886 
 887                         req = nfs_create_request(dreq->ctx, pagevec[i],
 888                                                  pgbase, req_len);
 889                         if (IS_ERR(req)) {
 890                                 result = PTR_ERR(req);
 891                                 break;
 892                         }
 893 
 894                         if (desc.pg_error < 0) {
 895                                 nfs_free_request(req);
 896                                 result = desc.pg_error;
 897                                 break;
 898                         }
 899 
 900                         nfs_lock_request(req);
 901                         req->wb_index = pos >> PAGE_SHIFT;
 902                         req->wb_offset = pos & ~PAGE_MASK;
 903                         if (!nfs_pageio_add_request(&desc, req)) {
 904                                 result = desc.pg_error;
 905                                 nfs_unlock_and_release_request(req);
 906                                 break;
 907                         }
 908                         pgbase = 0;
 909                         bytes -= req_len;
 910                         requested_bytes += req_len;
 911                         pos += req_len;
 912                         dreq->bytes_left -= req_len;
 913                 }
 914                 nfs_direct_release_pages(pagevec, npages);
 915                 kvfree(pagevec);
 916                 if (result < 0)
 917                         break;
 918         }
 919         nfs_pageio_complete(&desc);
 920 
 921         /*
 922          * If no bytes were started, return the error, and let the
 923          * generic layer handle the completion.
 924          */
 925         if (requested_bytes == 0) {
 926                 inode_dio_end(inode);
 927                 nfs_direct_req_release(dreq);
 928                 return result < 0 ? result : -EIO;
 929         }
 930 
 931         if (put_dreq(dreq))
 932                 nfs_direct_write_complete(dreq);
 933         return requested_bytes;
 934 }
 935 
 936 /**
 937  * nfs_file_direct_write - file direct write operation for NFS files
 938  * @iocb: target I/O control block
 939  * @iter: vector of user buffers from which to write data
 940  *
 941  * We use this function for direct writes instead of calling
 942  * generic_file_aio_write() in order to avoid taking the inode
 943  * semaphore and updating the i_size.  The NFS server will set
 944  * the new i_size and this client must read the updated size
 945  * back into its cache.  We let the server do generic write
 946  * parameter checking and report problems.
 947  *
 948  * We eliminate local atime updates, see direct read above.
 949  *
 950  * We avoid unnecessary page cache invalidations for normal cached
 951  * readers of this file.
 952  *
 953  * Note that O_APPEND is not supported for NFS direct writes, as there
 954  * is no atomic O_APPEND write facility in the NFS protocol.
 955  */
 956 ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 957 {
 958         ssize_t result = -EINVAL, requested;
 959         size_t count;
 960         struct file *file = iocb->ki_filp;
 961         struct address_space *mapping = file->f_mapping;
 962         struct inode *inode = mapping->host;
 963         struct nfs_direct_req *dreq;
 964         struct nfs_lock_context *l_ctx;
 965         loff_t pos, end;
 966 
 967         dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
 968                 file, iov_iter_count(iter), (long long) iocb->ki_pos);
 969 
 970         result = generic_write_checks(iocb, iter);
 971         if (result <= 0)
 972                 return result;
 973         count = result;
 974         nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
 975 
 976         pos = iocb->ki_pos;
 977         end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
 978 
 979         task_io_account_write(count);
 980 
 981         result = -ENOMEM;
 982         dreq = nfs_direct_req_alloc();
 983         if (!dreq)
 984                 goto out;
 985 
 986         dreq->inode = inode;
 987         dreq->bytes_left = dreq->max_count = count;
 988         dreq->io_start = pos;
 989         dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 990         l_ctx = nfs_get_lock_context(dreq->ctx);
 991         if (IS_ERR(l_ctx)) {
 992                 result = PTR_ERR(l_ctx);
 993                 nfs_direct_req_release(dreq);
 994                 goto out_release;
 995         }
 996         dreq->l_ctx = l_ctx;
 997         if (!is_sync_kiocb(iocb))
 998                 dreq->iocb = iocb;
 999 
1000         nfs_start_io_direct(inode);
1001 
1002         requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
1003 
1004         if (mapping->nrpages) {
1005                 invalidate_inode_pages2_range(mapping,
1006                                               pos >> PAGE_SHIFT, end);
1007         }
1008 
1009         nfs_end_io_direct(inode);
1010 
1011         if (requested > 0) {
1012                 result = nfs_direct_wait(dreq);
1013                 if (result > 0) {
1014                         requested -= result;
1015                         iocb->ki_pos = pos + result;
1016                         /* XXX: should check the generic_write_sync retval */
1017                         generic_write_sync(iocb, result);
1018                 }
1019                 iov_iter_revert(iter, requested);
1020         } else {
1021                 result = requested;
1022         }
1023 out_release:
1024         nfs_direct_req_release(dreq);
1025 out:
1026         return result;
1027 }
1028 
1029 /**
1030  * nfs_init_directcache - create a slab cache for nfs_direct_req structures
1031  *
1032  */
1033 int __init nfs_init_directcache(void)
1034 {
1035         nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
1036                                                 sizeof(struct nfs_direct_req),
1037                                                 0, (SLAB_RECLAIM_ACCOUNT|
1038                                                         SLAB_MEM_SPREAD),
1039                                                 NULL);
1040         if (nfs_direct_cachep == NULL)
1041                 return -ENOMEM;
1042 
1043         return 0;
1044 }
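
/*
 * Pairing sketch (illustrative; the exact call site is assumed): the
 * NFS client module setup is expected to call this once and undo it
 * on failure or unload:
 *
 *	if (nfs_init_directcache() != 0)
 *		goto out_fail;
 *	...
 *	nfs_destroy_directcache();
 */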
1045 
1046 /**
1047  * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
1048  *
1049  */
1050 void nfs_destroy_directcache(void)
1051 {
1052         kmem_cache_destroy(nfs_direct_cachep);
1053 }
