/*
 * linux/fs/ext4/page-io.c
 *
 * This contains the new page_io functions for ext4
 *
 * Written by Theodore Ts'o, 2010.
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

static struct kmem_cache *io_end_cachep;

int __init ext4_init_pageio(void)
{
	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
	if (io_end_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_pageio(void)
{
	kmem_cache_destroy(io_end_cachep);
}

/*
 * Print a buffer I/O error compatible with fs/buffer.c.  This provides
 * compatibility with dmesg scrapers that look for a specific buffer I/O
 * error message.  We really need a unified error reporting structure to
 * userspace a la Digital Unix's uerf system, but it's probably not going
 * to happen in my lifetime, due to LKML politics...
 */
static void buffer_io_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];
	printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
			   bdevname(bh->b_bdev, b),
			   (unsigned long long)bh->b_blocknr);
}

static void ext4_finish_bio(struct bio *bio)
{
	int i;
	struct bio_vec *bvec;

	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
		struct page *data_page = NULL;
		struct ext4_crypto_ctx *ctx = NULL;
#endif
		struct buffer_head *bh, *head;
		unsigned bio_start = bvec->bv_offset;
		unsigned bio_end = bio_start + bvec->bv_len;
		unsigned under_io = 0;
		unsigned long flags;

		if (!page)
			continue;

#ifdef CONFIG_EXT4_FS_ENCRYPTION
		if (!page->mapping) {
			/* The bounce data pages are unmapped. */
			data_page = page;
			ctx = (struct ext4_crypto_ctx *)page_private(data_page);
			page = ctx->w.control_page;
		}
#endif

		if (bio->bi_error) {
			SetPageError(page);
			set_bit(AS_EIO, &page->mapping->flags);
		}
		bh = head = page_buffers(page);
		/*
		 * We check all buffers in the page under BH_Uptodate_Lock
		 * to avoid races with other end io clearing async_write flags
		 */
		local_irq_save(flags);
		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
		do {
			if (bh_offset(bh) < bio_start ||
			    bh_offset(bh) + bh->b_size > bio_end) {
				if (buffer_async_write(bh))
					under_io++;
				continue;
			}
			clear_buffer_async_write(bh);
			if (bio->bi_error)
				buffer_io_error(bh);
		} while ((bh = bh->b_this_page) != head);
		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
		local_irq_restore(flags);
		if (!under_io) {
#ifdef CONFIG_EXT4_FS_ENCRYPTION
			if (ctx)
				ext4_restore_control_page(data_page);
#endif
			end_page_writeback(page);
		}
	}
}
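
/*
 * Roughly, the write completion side of this file fits together as
 * follows.  ext4_end_bio() runs as the bio ->bi_end_io handler: if the
 * io_end has EXT4_IO_END_UNWRITTEN set, the bio is chained onto
 * io_end->bio and only the io_end reference is dropped with
 * ext4_put_io_end_defer(); otherwise the reference is dropped and the bio
 * is finished right away via ext4_finish_bio().  When the last reference
 * goes, an io_end with nothing to convert is torn down directly by
 * ext4_release_io_end(), which also finishes any chained bios; an io_end
 * that still needs unwritten extent conversion is queued by
 * ext4_add_complete_io() on the rsv_conversion workqueue, and
 * ext4_end_io_rsv_work() later converts the extents and releases the
 * io_end the same way.
 */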

static void ext4_release_io_end(ext4_io_end_t *io_end)
{
	struct bio *bio, *next_bio;

	BUG_ON(!list_empty(&io_end->list));
	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
	WARN_ON(io_end->handle);

	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
		wake_up_all(ext4_ioend_wq(io_end->inode));

	for (bio = io_end->bio; bio; bio = next_bio) {
		next_bio = bio->bi_private;
		ext4_finish_bio(bio);
		bio_put(bio);
	}
	kmem_cache_free(io_end_cachep, io_end);
}

static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
	struct inode *inode = io_end->inode;

	io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
	/* Wake up anyone waiting on unwritten extent conversion */
	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
		wake_up_all(ext4_ioend_wq(inode));
}

/*
 * Check a range of space and convert unwritten extents to written.  Note
 * that we are protected from truncate touching the same part of the extent
 * tree by the fact that truncate code waits for all DIO to finish (thus
 * exclusion from direct IO is achieved) and also waits for PageWriteback
 * bits.  Thus we cannot get to ext4_ext_truncate() before all IOs
 * overlapping that range are completed (happens from ext4_release_io_end()).
 */
static int ext4_end_io(ext4_io_end_t *io)
{
	struct inode *inode = io->inode;
	loff_t offset = io->offset;
	ssize_t size = io->size;
	handle_t *handle = io->handle;
	int ret = 0;

	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
		   "list->prev 0x%p\n",
		   io, inode->i_ino, io->list.next, io->list.prev);

	io->handle = NULL;	/* Following call will use up the handle */
	ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
	if (ret < 0) {
		ext4_msg(inode->i_sb, KERN_EMERG,
			 "failed to convert unwritten extents to written "
			 "extents -- potential data loss! "
			 "(inode %lu, offset %llu, size %zd, error %d)",
			 inode->i_ino, offset, size, ret);
	}
	ext4_clear_io_unwritten_flag(io);
	ext4_release_io_end(io);
	return ret;
}

static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef	EXT4FS_DEBUG
	struct list_head *cur, *before, *after;
	ext4_io_end_t *io, *io0, *io1;

	if (list_empty(head))
		return;

	ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
	list_for_each_entry(io, head, list) {
		cur = &io->list;
		before = cur->prev;
		io0 = container_of(before, ext4_io_end_t, list);
		after = cur->next;
		io1 = container_of(after, ext4_io_end_t, list);

		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
			   io, inode->i_ino, io0, io1);
	}
#endif
}

/* Add the io_end to per-inode completed end_io list. */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
	struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
	struct workqueue_struct *wq;
	unsigned long flags;

	/* Only reserved conversions from writeback should enter here */
	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
	WARN_ON(!io_end->handle && sbi->s_journal);
	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
	wq = sbi->rsv_conversion_wq;
	if (list_empty(&ei->i_rsv_conversion_list))
		queue_work(wq, &ei->i_rsv_conversion_work);
	list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}

static int ext4_do_flush_completed_IO(struct inode *inode,
				      struct list_head *head)
{
	ext4_io_end_t *io;
	struct list_head unwritten;
	unsigned long flags;
	struct ext4_inode_info *ei = EXT4_I(inode);
	int err, ret = 0;

	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
	dump_completed_IO(inode, head);
	list_replace_init(head, &unwritten);
	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

	while (!list_empty(&unwritten)) {
		io = list_entry(unwritten.next, ext4_io_end_t, list);
		BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
		list_del_init(&io->list);

		err = ext4_end_io(io);
		if (unlikely(!ret && err))
			ret = err;
	}
	return ret;
}

/*
 * Work on completed IO, to convert unwritten extents to written extents.
 */
void ext4_end_io_rsv_work(struct work_struct *work)
{
	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
						  i_rsv_conversion_work);
	ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}

ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
	ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
	if (io) {
		atomic_inc(&EXT4_I(inode)->i_ioend_count);
		io->inode = inode;
		INIT_LIST_HEAD(&io->list);
		atomic_set(&io->count, 1);
	}
	return io;
}

void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
	if (atomic_dec_and_test(&io_end->count)) {
		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
			ext4_release_io_end(io_end);
			return;
		}
		ext4_add_complete_io(io_end);
	}
}

int ext4_put_io_end(ext4_io_end_t *io_end)
{
	int err = 0;

	if (atomic_dec_and_test(&io_end->count)) {
		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
			err = ext4_convert_unwritten_extents(io_end->handle,
						io_end->inode, io_end->offset,
						io_end->size);
			io_end->handle = NULL;
			ext4_clear_io_unwritten_flag(io_end);
		}
		ext4_release_io_end(io_end);
	}
	return err;
}

ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
	atomic_inc(&io_end->count);
	return io_end;
}

/* BIO completion function for page writeback */
static void ext4_end_bio(struct bio *bio)
{
	ext4_io_end_t *io_end = bio->bi_private;
	sector_t bi_sector = bio->bi_iter.bi_sector;

	BUG_ON(!io_end);
	bio->bi_end_io = NULL;

	if (bio->bi_error) {
		struct inode *inode = io_end->inode;

		ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
			     "(offset %llu size %ld starting block %llu)",
			     bio->bi_error, inode->i_ino,
			     (unsigned long long) io_end->offset,
			     (long) io_end->size,
			     (unsigned long long)
			     bi_sector >> (inode->i_blkbits - 9));
		mapping_set_error(inode->i_mapping, bio->bi_error);
	}

	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
		/*
		 * Link bio into list hanging from io_end. We have to do it
		 * atomically as bio completions can be racing against each
		 * other.
		 */
		bio->bi_private = xchg(&io_end->bio, bio);
		ext4_put_io_end_defer(io_end);
	} else {
		/*
		 * Drop io_end reference early. Inode can get freed once
		 * we finish the bio.
		 */
		ext4_put_io_end_defer(io_end);
		ext4_finish_bio(bio);
		bio_put(bio);
	}
}

void ext4_io_submit(struct ext4_io_submit *io)
{
	struct bio *bio = io->io_bio;

	if (bio) {
		int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
			    WRITE_SYNC : WRITE;
		bio_get(io->io_bio);
		submit_bio(io_op, io->io_bio);
		bio_put(io->io_bio);
	}
	io->io_bio = NULL;
}

void ext4_io_submit_init(struct ext4_io_submit *io,
			 struct writeback_control *wbc)
{
	io->io_wbc = wbc;
	io->io_bio = NULL;
	io->io_end = NULL;
}

static int io_submit_init_bio(struct ext4_io_submit *io,
			      struct buffer_head *bh)
{
	struct bio *bio;

	bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
	if (!bio)
		return -ENOMEM;
	wbc_init_bio(io->io_wbc, bio);
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	bio->bi_end_io = ext4_end_bio;
	bio->bi_private = ext4_get_io_end(io->io_end);
	io->io_bio = bio;
	io->io_next_block = bh->b_blocknr;
	return 0;
}

static int io_submit_add_bh(struct ext4_io_submit *io,
			    struct inode *inode,
			    struct page *page,
			    struct buffer_head *bh)
{
	int ret;

	if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
		ext4_io_submit(io);
	}
	if (io->io_bio == NULL) {
		ret = io_submit_init_bio(io, bh);
		if (ret)
			return ret;
	}
	ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
	if (ret != bh->b_size)
		goto submit_and_retry;
	wbc_account_io(io->io_wbc, page, bh->b_size);
	io->io_next_block++;
	return 0;
}
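
/*
 * A rough usage sketch for the submission helpers above.  The real callers
 * live in the writeback path (ext4_writepage()/ext4_writepages() in
 * fs/ext4/inode.c); error handling and the keep_towrite logic are omitted
 * here:
 *
 *	struct ext4_io_submit io;
 *
 *	ext4_io_submit_init(&io, wbc);
 *	io.io_end = ext4_init_io_end(inode, GFP_NOFS);
 *	ret = ext4_bio_write_page(&io, page, len, wbc, false);
 *	ext4_io_submit(&io);
 *	ext4_put_io_end_defer(io.io_end);
 *
 * ext4_bio_write_page() below marks the page's dirty buffers async_write
 * and feeds them to io_submit_add_bh(), which starts a new bio whenever
 * the next block is not contiguous or the current bio is full;
 * ext4_io_submit() then pushes the last partially built bio to the block
 * layer.
 */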

int ext4_bio_write_page(struct ext4_io_submit *io,
			struct page *page,
			int len,
			struct writeback_control *wbc,
			bool keep_towrite)
{
	struct page *data_page = NULL;
	struct inode *inode = page->mapping->host;
	unsigned block_start, blocksize;
	struct buffer_head *bh, *head;
	int ret = 0;
	int nr_submitted = 0;
	int nr_to_submit = 0;

	blocksize = 1 << inode->i_blkbits;

	BUG_ON(!PageLocked(page));
	BUG_ON(PageWriteback(page));

	if (keep_towrite)
		set_page_writeback_keepwrite(page);
	else
		set_page_writeback(page);
	ClearPageError(page);

	/*
	 * Comments copied from block_write_full_page:
	 *
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	if (len < PAGE_CACHE_SIZE)
		zero_user_segment(page, len, PAGE_CACHE_SIZE);
	/*
	 * In the first loop we prepare and mark buffers to submit. We have to
	 * mark all buffers in the page before submitting so that
	 * end_page_writeback() cannot be called from ext4_end_bio() when IO
	 * on the first buffer finishes and we are still working on submitting
	 * the second buffer.
	 */
	bh = head = page_buffers(page);
	do {
		block_start = bh_offset(bh);
		if (block_start >= len) {
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
			continue;
		}
		if (!buffer_dirty(bh) || buffer_delay(bh) ||
		    !buffer_mapped(bh) || buffer_unwritten(bh)) {
			/* A hole? We can safely clear the dirty bit */
			if (!buffer_mapped(bh))
				clear_buffer_dirty(bh);
			if (io->io_bio)
				ext4_io_submit(io);
			continue;
		}
		if (buffer_new(bh)) {
			clear_buffer_new(bh);
			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
		}
		set_buffer_async_write(bh);
		nr_to_submit++;
	} while ((bh = bh->b_this_page) != head);

	bh = head = page_buffers(page);

	if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
	    nr_to_submit) {
		data_page = ext4_encrypt(inode, page);
		if (IS_ERR(data_page)) {
			ret = PTR_ERR(data_page);
			data_page = NULL;
			goto out;
		}
	}

	/* Now submit buffers to write */
	do {
		if (!buffer_async_write(bh))
			continue;
		ret = io_submit_add_bh(io, inode,
				       data_page ? data_page : page, bh);
		if (ret) {
			/*
			 * We only get here on ENOMEM.  Not much else
			 * we can do but mark the page as dirty, and
			 * better luck next time.
			 */
			break;
		}
		nr_submitted++;
		clear_buffer_dirty(bh);
	} while ((bh = bh->b_this_page) != head);

	/* Error stopped previous loop? Clean up buffers... */
	if (ret) {
	out:
		if (data_page)
			ext4_restore_control_page(data_page);
		printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
		redirty_page_for_writepage(wbc, page);
		do {
			clear_buffer_async_write(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	unlock_page(page);
	/* Nothing submitted - we have to end page writeback */
	if (!nr_submitted)
		end_page_writeback(page);
	return ret;
}