/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under journal->j_list_lock.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Decrement reference counter for data buffer. If it has been marked
 * 'BH_Freed', release it and the page to which it belongs if possible.
 */
static void release_data_buffer(struct buffer_head *bh)
{
	if (buffer_freed(bh)) {
		WARN_ON_ONCE(buffer_dirty(bh));
		clear_buffer_freed(bh);
		clear_buffer_mapped(bh);
		clear_buffer_new(bh);
		clear_buffer_req(bh);
		bh->b_bdev = NULL;
		release_buffer_page(bh);
	} else
		put_bh(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For lock-ordering reasons we must trylock.  If we lose, schedule away
 * and return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
	if (!jbd_trylock_bh_state(bh)) {
		spin_unlock(&journal->j_list_lock);
		schedule();
		return 0;
	}
	return 1;
}
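
/*
 * On failure the caller must retake the locks in the correct order
 * before retrying, e.g. (from journal_submit_data_buffers() below):
 *
 *	if (!inverted_lock(journal, bh)) {
 *		jbd_lock_bh_state(bh);
 *		spin_lock(&journal->j_list_lock);
 *	}
 */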

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
					transaction_t *commit_transaction)
{
	struct journal_head *descriptor;
	struct buffer_head *bh;
	journal_header_t *header;
	int ret;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	header = (journal_header_t *)(bh->b_data);
	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

	JBUFFER_TRACE(descriptor, "write commit block");
	set_buffer_dirty(bh);

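	/*
	 * With barriers enabled, write the commit block with a cache
	 * flush and FUA so it reaches stable storage only after the
	 * journal blocks written earlier for this transaction.
	 */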
	if (journal->j_flags & JFS_BARRIER)
		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
	else
		ret = sync_dirty_buffer(bh);

	put_bh(bh);		/* One for getblk() */
	journal_put_journal_head(descriptor);

	return (ret == -EIO);
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
				   int write_op)
{
	int i;

	for (i = 0; i < bufs; i++) {
		wbuf[i]->b_end_io = end_buffer_write_sync;
		/*
		 * Here we write back pagecache data that may be mmapped. Since
		 * we cannot afford to clean the page and set PageWriteback
		 * here due to lock ordering (page lock ranks above transaction
		 * start), the data can change while IO is in flight. Tell the
		 * block layer it should bounce the bio pages if stable data
		 * during write is required.
		 *
		 * We use up our safety reference in submit_bh().
		 */
		_submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
	}
}

/*
 *  Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
				       transaction_t *commit_transaction,
				       int write_op)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;
	int err = 0;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (!trylock_buffer(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh) || bh2jh(bh) != jh
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			release_data_buffer(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				goto write_out_data;
			}
		} else if (!locked && buffer_locked(bh)) {
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			release_data_buffer(bh);
		}

		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	trace_jbd_do_submit_data(journal, commit_transaction);
	journal_do_submit_data(wbuf, bufs, write_op);

	return err;
}

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
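/*
 * Roughly, commit proceeds in the numbered phases marked by the
 * jbd_debug() calls below:
 *   1: discard unused reserved buffers, clean the checkpoint lists
 *      and switch the revoke table;
 *   2: submit the transaction's data buffers and wait for them;
 *   3: write the metadata (with descriptor blocks) to the log;
 *   4: wait for the temporary metadata IO buffers;
 *   5: wait for the revoke/descriptor control buffers;
 *   6: write the commit record;
 *   7: file committed buffers onto the checkpoint lists;
 *   8: mark the transaction T_FINISHED and wake up waiters.
 */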
void journal_commit_transaction(journal_t *journal)
{
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned int blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	struct blk_plug plug;
	int write_op = WRITE;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior journal_flush? */
	if (journal->j_flags & JFS_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		journal_update_sb_log_tail(journal, journal->j_tail_sequence,
					   journal->j_tail, WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;

	trace_jbd_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd_commit_locking(journal, commit_transaction);
	spin_lock(&commit_transaction->t_handle_lock);
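	/*
	 * Open-coded wait for t_updates to drain: queue ourselves,
	 * recheck under the locks, then drop both locks before
	 * sleeping and retake them afterwards.
	 */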
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A journal_get_undo_access()+journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked
	 * buffers in the next transaction which is going to be started.
	 */
	journal_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	journal_switch_revoke_table(journal);

	trace_jbd_commit_flushing(journal, commit_transaction);
	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

	if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
		write_op = WRITE_SYNC;

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
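	/* Plug the block queue so the data writes submitted below get
	 * batched and dispatched together. */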
	blk_start_plug(&plug);
	err = journal_submit_data_buffers(journal, commit_transaction,
					  write_op);
	blk_finish_plug(&plug);

	/*
	 * Wait for all previously submitted IO to complete.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			spin_lock(&journal->j_list_lock);
		}
		if (unlikely(!buffer_uptodate(bh))) {
			if (!trylock_page(bh->b_page)) {
				spin_unlock(&journal->j_list_lock);
				lock_page(bh->b_page);
				spin_lock(&journal->j_list_lock);
			}
			if (bh->b_page->mapping)
				set_bit(AS_EIO, &bh->b_page->mapping->flags);

			unlock_page(bh->b_page);
			SetPageError(bh->b_page);
			err = -EIO;
		}
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && bh2jh(bh) == jh &&
		    jh->b_transaction == commit_transaction &&
		    jh->b_jlist == BJ_Locked)
			__journal_unfile_buffer(jh);
		jbd_unlock_bh_state(bh);
		release_data_buffer(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (err) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: Detected IO errors while flushing file data "
			"on %s\n", bdevname(journal->j_fs_dev, b));
		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
			journal_abort(journal, err);
		err = 0;
	}

	blk_start_plug(&plug);

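	/*
	 * Revoke records tell journal recovery not to replay stale
	 * journal blocks over these block numbers.
	 */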
	journal_write_revoke_records(journal, commit_transaction, write_op);

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 */
	J_ASSERT (commit_transaction->t_sync_datalist == NULL);

	jbd_debug (3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	trace_jbd_commit_logging(journal, commit_transaction);
	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);

	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}
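
		/*
		 * The descriptor block set up above is a journal_header_t
		 * followed by an array of journal_block_tag_t entries;
		 * the first tag is followed by the 16-byte journal UUID,
		 * and the last tag gets JFS_FLAG_LAST_TAG before the IO
		 * is submitted.
		 */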

		/* Where is the buffer to be written? */

		err = journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		get_bh(jh2bh(jh));

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO*/

		set_buffer_jwrite(jh2bh(jh));
		/*
		 * akpm: journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_buffer_jwrite(jh2bh(new_jh));
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JFS_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JFS_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += sizeof(journal_block_tag_t);
		space_left -= sizeof(journal_block_tag_t);

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < sizeof(journal_block_tag_t) + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				/*
				 * In data=journal mode, here we can end up
				 * writing pagecache data that might be
				 * mmapped. Since we can't afford to clean the
				 * page and set PageWriteback (see the comment
				 * near the other use of _submit_bh()), the
				 * data can change while the write is in
				 * flight.  Tell the block layer to bounce the
				 * bio pages if stable pages are required.
				 */
				_submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
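	/*
	 * Each metadata buffer was paired with a temporary BJ_IO
	 * buffer_head that was actually written to the log, while the
	 * original sits on t_shadow_list.  As each temporary IO
	 * completes, free it and move its shadowed counterpart to
	 * BJ_Forget for later checkpointing.
	 */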
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this
		 * IO to complete. The barrier must be here so that changes
		 * by journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		journal_unfile_buffer(journal, jh);
		journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 6\n");

	/* All metadata is written, now write commit record and do cleanup */
	spin_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_RECORD;
	spin_unlock(&journal->j_state_lock);

	if (journal_write_commit_record(journal, commit_transaction))
		err = -EIO;

	if (err)
		journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			__journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled by
		 * a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that since the
			 * buffer may still be accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * A buffer on the BJ_Forget list that is not jbddirty
			 * has been freed by this transaction and hence could
			 * not have been reallocated until this transaction
			 * has committed. *BUT* it could be reallocated once
			 * we have written all the data to disk and before we
			 * process the buffer on the BJ_Forget list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile freed buffer");
		__journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Keep a 3:1 exponentially weighted moving average, weighting
	 * the latest commit time more heavily than the old average so
	 * the estimate adapts quickly while still smoothing out spikes.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time*3 +
				journal->j_average_commit_time) / 4;
	else
		journal->j_average_commit_time = commit_time;
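	/*
	 * For example, an old average of 8 and a new commit_time of 4
	 * (in the same ns units) gives (3*4 + 8) / 4 = 5.
	 */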

	spin_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	trace_jbd_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}
