root/fs/jbd2/commit.c

DEFINITIONS

This source file includes the following definitions.
  1. journal_end_buffer_io_sync
  2. release_buffer_page
  3. jbd2_commit_block_csum_set
  4. journal_submit_commit_record
  5. journal_wait_on_commit_record
  6. journal_submit_inode_data_buffers
  7. journal_submit_data_buffers
  8. journal_finish_inode_data_buffers
  9. jbd2_checksum_data
  10. write_tag_block
  11. jbd2_block_tag_csum_set
  12. jbd2_journal_commit_transaction

// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        struct buffer_head *orig_bh = bh->b_private;

        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        if (orig_bh) {
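                /*
                 * The buffer being written is a frozen copy; orig_bh is the
                 * buffer it shadows.  Clear BH_Shadow with unlock semantics
                 * and wake anyone (see do_get_write_access()) sleeping on
                 * that bit, waiting to modify the buffer again.
                 */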
                clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
                smp_mb__after_atomic();
                wake_up_bit(&orig_bh->b_state, BH_Shadow);
        }
        unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (!trylock_page(page))
                goto nope;

        get_page(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        put_page(page);
        return;

nope:
        __brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
        struct commit_header *h;
        __u32 csum;

        if (!jbd2_journal_has_csum_v2or3(j))
                return;

        h = (struct commit_header *)(bh->b_data);
        h->h_chksum_type = 0;
        h->h_chksum_size = 0;
        h->h_chksum[0] = 0;
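        /*
         * The checksum covers the whole commit block, so the checksum field
         * itself must be zeroed before the block is summed.
         */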
        csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
        h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction,
                                        struct buffer_head **cbh,
                                        __u32 crc32_sum)
{
        struct commit_header *tmp;
        struct buffer_head *bh;
        int ret;
        struct timespec64 now;

        *cbh = NULL;

        if (is_journal_aborted(journal))
                return 0;

        bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
                                                JBD2_COMMIT_BLOCK);
        if (!bh)
                return 1;

        tmp = (struct commit_header *)bh->b_data;
        ktime_get_coarse_real_ts64(&now);
        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

        if (jbd2_has_feature_checksum(journal)) {
                tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
                tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
                tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
        }
        jbd2_commit_block_csum_set(journal, bh);

        BUFFER_TRACE(bh, "submit commit block");
        lock_buffer(bh);
        clear_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = journal_end_buffer_io_sync;
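        /*
         * With barriers enabled and a synchronous commit, write the commit
         * record with a preceding cache flush (REQ_PREFLUSH, so all earlier
         * journal writes reach stable media first) and FUA (so the record
         * itself is durable when the IO completes).
         */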
        if (journal->j_flags & JBD2_BARRIER &&
            !jbd2_has_feature_async_commit(journal))
                ret = submit_bh(REQ_OP_WRITE,
                        REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
        else
                ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);

        *cbh = bh;
        return ret;
}

/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
                                         struct buffer_head *bh)
{
        int ret = 0;

        clear_buffer_dirty(bh);
        wait_on_buffer(bh);

        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
        put_bh(bh);            /* One for getblk() */

        return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We do no block allocation here, even for delalloc: we avoid
 * writepages() because with delayed allocation it may allocate blocks,
 * and only already-allocated blocks may be written at this point.
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping,
                loff_t dirty_start, loff_t dirty_end)
{
        int ret;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = mapping->nrpages * 2,
                .range_start = dirty_start,
                .range_end = dirty_end,
        };

        ret = generic_writepages(mapping, &wbc);
        return ret;
}

/*
 * Submit all the data buffers of inodes associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * are currently operating on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
                transaction_t *commit_transaction)
{
        struct jbd2_inode *jinode;
        int err, ret = 0;
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                loff_t dirty_start = jinode->i_dirty_start;
                loff_t dirty_end = jinode->i_dirty_end;

                if (!(jinode->i_flags & JI_WRITE_DATA))
                        continue;
                mapping = jinode->i_vfs_inode->i_mapping;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
                /*
                 * Submit the inode data buffers. We use writepage
                 * instead of writepages because writepages can do
                 * block allocation under delalloc, and we must write
                 * only already-allocated blocks here.
                 */
                trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
                err = journal_submit_inode_data_buffers(mapping, dirty_start,
                                dirty_end);
                if (!ret)
                        ret = err;
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                jinode->i_flags &= ~JI_COMMIT_RUNNING;
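                /*
                 * Pairs with the wait in jbd2_journal_release_jbd_inode(),
                 * which sleeps on __JI_COMMIT_RUNNING until we wake it here.
                 */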
                smp_mb();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
        spin_unlock(&journal->j_list_lock);
        return ret;
}

/*
 * Wait for data submitted for writeout, and refile inodes to the proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
                transaction_t *commit_transaction)
{
        struct jbd2_inode *jinode, *next_i;
        int err, ret = 0;

        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                loff_t dirty_start = jinode->i_dirty_start;
                loff_t dirty_end = jinode->i_dirty_end;

                if (!(jinode->i_flags & JI_WAIT_DATA))
                        continue;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait_range_keep_errors(
                                jinode->i_vfs_inode->i_mapping, dirty_start,
                                dirty_end);
                if (!ret)
                        ret = err;
                spin_lock(&journal->j_list_lock);
                jinode->i_flags &= ~JI_COMMIT_RUNNING;
                smp_mb();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }

        /* Now refile inode to proper lists */
        list_for_each_entry_safe(jinode, next_i,
                                 &commit_transaction->t_inode_list, i_list) {
                list_del(&jinode->i_list);
                if (jinode->i_next_transaction) {
                        jinode->i_transaction = jinode->i_next_transaction;
                        jinode->i_next_transaction = NULL;
                        list_add(&jinode->i_list,
                                &jinode->i_transaction->t_inode_list);
                } else {
                        jinode->i_transaction = NULL;
                        jinode->i_dirty_start = 0;
                        jinode->i_dirty_end = 0;
                }
        }
        spin_unlock(&journal->j_list_lock);

        return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
        struct page *page = bh->b_page;
        char *addr;
        __u32 checksum;

        addr = kmap_atomic(page);
        checksum = crc32_be(crc32_sum,
                (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
        kunmap_atomic(addr);

        return checksum;
}

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
                                   unsigned long long block)
{
        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
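        /*
         * The shift is split in two so that it stays well defined even if
         * "block" is ever narrowed to a 32-bit type on some configuration.
         */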
        if (jbd2_has_feature_64bit(j))
                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
                                    struct buffer_head *bh, __u32 sequence)
{
        journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
        struct page *page = bh->b_page;
        __u8 *addr;
        __u32 csum32;
        __be32 seq;

        if (!jbd2_journal_has_csum_v2or3(j))
                return;

        seq = cpu_to_be32(sequence);
        addr = kmap_atomic(page);
        csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
        csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
                             bh->b_size);
        kunmap_atomic(addr);

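        /*
         * csum3 tags have room for the full 32-bit checksum; older csum2
         * tags store only the low 16 bits of it.
         */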
        if (jbd2_has_feature_csum3(j))
                tag3->t_checksum = cpu_to_be32(csum32);
        else
                tag->t_checksum = cpu_to_be16(csum32);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
        struct transaction_stats_s stats;
        transaction_t *commit_transaction;
        struct journal_head *jh;
        struct buffer_head *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned long long blocknr;
        ktime_t start_time;
        u64 commit_time;
        char *tagp = NULL;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
        struct blk_plug plug;
        /* Tail of the journal */
        unsigned long first_block;
        tid_t first_tid;
        int update_tail;
        int csum_size = 0;
        LIST_HEAD(io_bufs);
        LIST_HEAD(log_bufs);

        if (jbd2_journal_has_csum_v2or3(journal))
                csum_size = sizeof(struct jbd2_journal_block_tail);

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
        if (journal->j_flags & JBD2_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                mutex_lock_io(&journal->j_checkpoint_mutex);
                /*
                 * We hold j_checkpoint_mutex so tail cannot change under us.
                 * We don't need any special data guarantees for writing sb
                 * since journal is empty and it is ok for write to be
                 * flushed only with transaction commit.
                 */
                jbd2_journal_update_sb_log_tail(journal,
                                                journal->j_tail_sequence,
                                                journal->j_tail,
                                                REQ_SYNC);
                mutex_unlock(&journal->j_checkpoint_mutex);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;

        trace_jbd2_start_commit(journal, commit_transaction);
        jbd_debug(1, "JBD2: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        write_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_RUNNING);
        commit_transaction->t_state = T_LOCKED;

        trace_jbd2_commit_locking(journal, commit_transaction);
        stats.run.rs_wait = commit_transaction->t_max_wait;
        stats.run.rs_request_delay = 0;
        stats.run.rs_locked = jiffies;
        if (commit_transaction->t_requested)
                stats.run.rs_request_delay =
                        jbd2_time_diff(commit_transaction->t_requested,
                                       stats.run.rs_locked);
        stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
                                              stats.run.rs_locked);

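        /*
         * Drain all handles still running against this transaction: each
         * jbd2_journal_stop() drops t_updates and wakes j_wait_updates, so
         * sleep until the count reaches zero before switching state.
         */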
        spin_lock(&commit_transaction->t_handle_lock);
        while (atomic_read(&commit_transaction->t_updates)) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (atomic_read(&commit_transaction->t_updates)) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        write_unlock(&journal->j_state_lock);
                        schedule();
                        write_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);
        commit_transaction->t_state = T_SWITCH;
        write_unlock(&journal->j_state_lock);

        J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a jbd2_journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple jbd2_journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                jbd2_journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory
         */
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_clean_checkpoint_list(journal, false);
        spin_unlock(&journal->j_list_lock);

        jbd_debug(3, "JBD2: commit phase 1\n");

        /*
         * Clear the revoked flag to reflect that there are no revoked
         * buffers in the next transaction which is about to be started.
         */
        jbd2_clear_buffer_revoked_flags(journal);

        /*
         * Switch to a new revoke table.
         */
        jbd2_journal_switch_revoke_table(journal);

        /*
         * Reserved credits cannot be claimed anymore, free them
         */
        atomic_sub(atomic_read(&journal->j_reserved_credits),
                   &commit_transaction->t_outstanding_credits);

        write_lock(&journal->j_state_lock);
        trace_jbd2_commit_flushing(journal, commit_transaction);
        stats.run.rs_flushing = jiffies;
        stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
                                             stats.run.rs_flushing);

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        write_unlock(&journal->j_state_lock);

        jbd_debug(3, "JBD2: commit phase 2a\n");

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = journal_submit_data_buffers(journal, commit_transaction);
        if (err)
                jbd2_journal_abort(journal, err);

        blk_start_plug(&plug);
        jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

        jbd_debug(3, "JBD2: commit phase 2b\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        write_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
        write_unlock(&journal->j_state_lock);

        trace_jbd2_commit_logging(journal, commit_transaction);
        stats.run.rs_logging = jiffies;
        stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
                                               stats.run.rs_logging);
        stats.run.rs_blocks =
                atomic_read(&commit_transaction->t_outstanding_credits);
        stats.run.rs_blocks_logged = 0;

        J_ASSERT(commit_transaction->t_nr_buffers <=
                 atomic_read(&commit_transaction->t_outstanding_credits));

        err = 0;
        bufs = 0;
        descriptor = NULL;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it. */

                if (is_journal_aborted(journal)) {
                        clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        jbd2_buffer_abort_trigger(jh,
                                                  jh->b_frozen_data ?
                                                  jh->b_frozen_triggers :
                                                  jh->b_triggers);
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

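                /*
                 * Rough on-disk layout of a descriptor block (a sketch; the
                 * tag size varies with the 64bit and csum3 features):
                 *
                 *   journal_header_t
                 *   tag 0                      first tag, followed by ...
                 *   uuid[16]                   ... the 16-byte journal UUID
                 *   tag 1                      JBD2_FLAG_SAME_UUID
                 *   ...
                 *   tag N                      JBD2_FLAG_LAST_TAG
                 *   jbd2_journal_block_tail    (only with csum v2/v3)
                 */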
                if (!descriptor) {
                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD2: get descriptor\n");

                        descriptor = jbd2_journal_get_descriptor_buffer(
                                                        commit_transaction,
                                                        JBD2_DESCRIPTOR_BLOCK);
                        if (!descriptor) {
                                jbd2_journal_abort(journal, -EIO);
                                continue;
                        }

                        jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
                                (unsigned long long)descriptor->b_blocknr,
                                descriptor->b_data);
                        tagp = &descriptor->b_data[sizeof(journal_header_t)];
                        space_left = descriptor->b_size -
                                                sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(descriptor);
                        set_buffer_dirty(descriptor);
                        wbuf[bufs++] = descriptor;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(descriptor, "ph3: file as descriptor");
                        jbd2_file_log_bh(&log_bufs, descriptor);
                }

                /* Where is the buffer to be written? */

                err = jbd2_journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        jbd2_journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by jbd2_journal_next_log_block() also.
                 */
                atomic_dec(&commit_transaction->t_outstanding_credits);

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /*
                 * Make a temporary IO buffer with which to write it out
                 * (this will requeue the metadata buffer to BJ_Shadow).
                 */
                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                jh, &wbuf[bufs], blocknr);
                if (flags < 0) {
                        jbd2_journal_abort(journal, flags);
                        continue;
                }
                jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

                /* Record the new block's tag in the current descriptor
                   buffer */

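                /*
                 * Bit 0 of the value returned by
                 * jbd2_journal_write_metadata_buffer() reports whether the
                 * block had to be "escaped": a block that starts with the
                 * JBD2 magic number has that magic zeroed in the journal
                 * copy, and JBD2_FLAG_ESCAPE tells recovery to restore it.
                 */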
                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JBD2_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JBD2_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be16(tag_flag);
                jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
                                        commit_transaction->t_tid);
                tagp += tag_bytes;
                space_left -= tag_bytes;
                bufs++;

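                /*
                 * Only the first tag in each descriptor block is followed by
                 * the 16-byte journal UUID; subsequent tags carry
                 * JBD2_FLAG_SAME_UUID instead.
                 */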
                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }

                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

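                /*
                 * "Full" here means there is no longer room for another tag,
                 * a possible 16-byte UUID, and the checksum tail.
                 */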
                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < tag_bytes + 16 + csum_size) {

                        jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
                        if (descriptor)
                                jbd2_descriptor_block_csum_set(journal,
                                                        descriptor);

                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                /*
                                 * Compute checksum.
                                 */
                                if (jbd2_has_feature_checksum(journal)) {
                                        crc32_sum =
                                            jbd2_checksum_data(crc32_sum, bh);
                                }

                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
                        }
                        cond_resched();

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        err = journal_finish_inode_data_buffers(journal, commit_transaction);
        if (err) {
                printk(KERN_WARNING
                        "JBD2: Detected IO errors while flushing file data "
                       "on %s\n", journal->j_devname);
                if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
                        jbd2_journal_abort(journal, err);
                err = 0;
        }

        /*
         * Get current oldest transaction in the log before we issue flush
         * to the filesystem device. After the flush we can be sure that
         * blocks of all older transactions are checkpointed to persistent
         * storage and we will be safe to update journal start in the
         * superblock with the numbers we get here.
         */
        update_tail =
                jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

        write_lock(&journal->j_state_lock);
        if (update_tail) {
                long freed = first_block - journal->j_tail;

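                /*
                 * The log is circular: if first_block wrapped around past
                 * j_tail, add the usable log size back in.
                 */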
                if (first_block < journal->j_tail)
                        freed += journal->j_last - journal->j_first;
                /* Update tail only if we free significant amount of space */
                if (freed < journal->j_maxlen / 4)
                        update_tail = 0;
        }
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
        commit_transaction->t_state = T_COMMIT_DFLUSH;
        write_unlock(&journal->j_state_lock);

        /*
         * If the journal is not located on the file system device,
         * then we must flush the file system device before we issue
         * the commit record
         */
        if (commit_transaction->t_need_data_flush &&
            (journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
                blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

        /* Done it all: now write the commit record asynchronously. */
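        /*
         * With the async_commit feature the commit block can be submitted
         * before the preceding IO completes: recovery verifies the
         * transaction against the crc32 carried in the commit block instead
         * of relying on write ordering.
         */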
        if (jbd2_has_feature_async_commit(journal)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                 &cbh, crc32_sum);
                if (err)
                        jbd2_journal_abort(journal, err);
        }

        blk_finish_plug(&plug);

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the io_bufs list.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD2: commit phase 3\n");

        while (!list_empty(&io_bufs)) {
                struct buffer_head *bh = list_entry(io_bufs.prev,
                                                    struct buffer_head,
                                                    b_assoc_buffers);

                wait_on_buffer(bh);
                cond_resched();

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;
                jbd2_unfile_log_bh(bh);
                stats.run.rs_blocks_logged++;

                /*
                 * The list contains temporary buffer heads created by
                 * jbd2_journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to refile the corresponding shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_buffer_jwrite(bh);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));
                J_ASSERT_BH(bh, !buffer_shadow(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD2: commit phase 4\n");

        /* Here we wait for the revoke record and descriptor record buffers */
        while (!list_empty(&log_bufs)) {
                struct buffer_head *bh;

                bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
                wait_on_buffer(bh);
                cond_resched();

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                jbd2_unfile_log_bh(bh);
                stats.run.rs_blocks_logged++;
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        if (err)
                jbd2_journal_abort(journal, err);

        jbd_debug(3, "JBD2: commit phase 5\n");
        write_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
        commit_transaction->t_state = T_COMMIT_JFLUSH;
        write_unlock(&journal->j_state_lock);

        if (!jbd2_has_feature_async_commit(journal)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                &cbh, crc32_sum);
                if (err)
                        jbd2_journal_abort(journal, err);
        }
        if (cbh)
                err = journal_wait_on_commit_record(journal, cbh);
        stats.run.rs_blocks_logged++;
        if (jbd2_has_feature_async_commit(journal) &&
            journal->j_flags & JBD2_BARRIER) {
                blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
        }

        if (err)
                jbd2_journal_abort(journal, err);

        /*
         * Now disk caches for filesystem device are flushed so we are safe to
         * erase checkpointed transactions from the log by updating journal
         * superblock.
         */
        if (update_tail)
                jbd2_update_log_tail(journal, first_tid, first_block);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD2: commit phase 6\n");

        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
                int try_to_free = 0;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                /*
                 * Get a reference so that bh cannot be freed before we are
                 * done with it.
                 */
                get_bh(bh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 *
                 * We also know that the frozen data has already fired
                 * its triggers if they exist, so we can clear that too.
                 */
                if (jh->b_committed_data) {
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                                jh->b_frozen_triggers = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                        jh->b_frozen_triggers = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        cp_transaction->t_chp_stats.cs_dropped++;
                        __jbd2_journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by jbd2_journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /*
                 * A buffer which was freed while still being journaled by a
                 * previous transaction was refiled to BJ_Forget of the
                 * running transaction. If the just-committed transaction
                 * contains an "add to orphan" operation, we can completely
                 * invalidate the buffer now. We are rather thorough about
                 * that, since the buffer may still be accessible when
                 * blocksize < pagesize and it is attached to the last
                 * partial page.
                 */
                if (buffer_freed(bh) && !jh->b_next_transaction) {
                        struct address_space *mapping;

                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);

                        /*
                         * Block device buffers need to stay mapped all the
                         * time, so it is enough to clear buffer_jbddirty and
                         * buffer_freed bits. For the file mapping buffers (i.e.
                         * journalled data) we need to unmap buffer and clear
                         * more bits. We also need to be careful about the check
                         * because the data page mapping can get cleared under
                         * our hands. Note that if mapping == NULL, we don't
                         * need to make buffer unmapped because the page is
                         * already detached from the mapping and buffers cannot
                         * get reused.
                         */
                        mapping = READ_ONCE(bh->b_page->mapping);
                        if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
                                clear_buffer_mapped(bh);
                                clear_buffer_new(bh);
                                clear_buffer_req(bh);
                                bh->b_bdev = NULL;
                        }
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
                        if (is_journal_aborted(journal))
                                clear_buffer_jbddirty(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /*
                         * The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list.
                         */
                        if (!jh->b_next_transaction)
                                try_to_free = 1;
                }
                JBUFFER_TRACE(jh, "refile or unfile buffer");
                __jbd2_journal_refile_buffer(jh);
                jbd_unlock_bh_state(bh);
                if (try_to_free)
                        release_buffer_page(bh);        /* Drops bh reference */
                else
                        __brelse(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __jbd2_journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        write_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                write_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Add the transaction to the checkpoint list
         * __journal_remove_checkpoint() can not destroy transaction
         * under us because it is not marked as T_FINISHED yet */
        if (journal->j_checkpoint_transactions == NULL) {
                journal->j_checkpoint_transactions = commit_transaction;
                commit_transaction->t_cpnext = commit_transaction;
                commit_transaction->t_cpprev = commit_transaction;
        } else {
                commit_transaction->t_cpnext =
                        journal->j_checkpoint_transactions;
                commit_transaction->t_cpprev =
                        commit_transaction->t_cpnext->t_cpprev;
                commit_transaction->t_cpnext->t_cpprev =
                        commit_transaction;
                commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
        }
        spin_unlock(&journal->j_list_lock);

        /* Done with this transaction! */

        jbd_debug(3, "JBD2: commit phase 7\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

        commit_transaction->t_start = jiffies;
        stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
                                              commit_transaction->t_start);

        /*
         * File the transaction statistics
         */
        stats.ts_tid = commit_transaction->t_tid;
        stats.run.rs_handle_count =
                atomic_read(&commit_transaction->t_handle_count);
        trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
                             commit_transaction->t_tid, &stats.run);
        stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

        commit_transaction->t_state = T_COMMIT_CALLBACK;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

        /*
         * weight the commit time higher than the average time so we don't
         * react too strongly to vast changes in the commit time
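         * (i.e. avg = (commit_time + 3 * avg) / 4, an exponentially
         * weighted moving average with weight 1/4 on the new sample)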
         */
        if (likely(journal->j_average_commit_time))
                journal->j_average_commit_time = (commit_time +
                                journal->j_average_commit_time*3) / 4;
        else
                journal->j_average_commit_time = commit_time;

        write_unlock(&journal->j_state_lock);

        if (journal->j_commit_callback)
                journal->j_commit_callback(journal, commit_transaction);

        trace_jbd2_end_commit(journal, commit_transaction);
        jbd_debug(1, "JBD2: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        write_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        commit_transaction->t_state = T_FINISHED;
        /* Check if the transaction can be dropped now that we are finished */
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
                jbd2_journal_free_transaction(commit_transaction);
        }
        spin_unlock(&journal->j_list_lock);
        write_unlock(&journal->j_state_lock);
        wake_up(&journal->j_wait_done_commit);

        /*
         * Calculate overall stats
         */
        spin_lock(&journal->j_history_lock);
        journal->j_stats.ts_tid++;
        journal->j_stats.ts_requested += stats.ts_requested;
        journal->j_stats.run.rs_wait += stats.run.rs_wait;
        journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
        journal->j_stats.run.rs_running += stats.run.rs_running;
        journal->j_stats.run.rs_locked += stats.run.rs_locked;
        journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
        journal->j_stats.run.rs_logging += stats.run.rs_logging;
        journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
        journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
        journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
        spin_unlock(&journal->j_history_lock);
}
