// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"
struct kmem_cache	*xfs_buf_item_cache;
static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}
/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
	struct xfs_log_iovec		*iovec)
{
	struct xfs_buf_log_format	*blfp = iovec->i_addr;
	char				*bmp_end;
	char				*item_end;

	if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
		return false;

	item_end = (char *)iovec->i_addr + iovec->i_len;
	bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}
static inline int
xfs_buf_log_format_size(
	struct xfs_buf_log_format *blfp)
{
	return offsetof(struct xfs_buf_log_format, blf_data_map) +
			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}
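/*
 * Illustrative sizing example (assuming the usual 128 byte XFS_BLF_CHUNK and
 * 32-bit bitmap words, not restated elsewhere in this file): a 4k single-map
 * buffer covers 4096 / 128 = 32 chunks, so blf_map_size is 1 and
 * xfs_buf_log_format_size() returns offsetof(..., blf_data_map) plus one
 * 4-byte bitmap word.
 */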
static inline bool
xfs_buf_item_straddle(
	struct xfs_buf		*bp,
	uint			offset,
	int			first_bit,
	int			nbits)
{
	void			*first, *last;

	first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
	last = xfs_buf_offset(bp,
			offset + ((first_bit + nbits) << XFS_BLF_SHIFT));

	if (last - first != nbits * XFS_BLF_CHUNK)
		return true;
	return false;
}
/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for each
 * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
 * in a single iovec.
 */
static void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item		*bip,
	struct xfs_buf_log_format	*blfp,
	uint				offset,
	int				*nvecs,
	int				*nbytes)
{
	struct xfs_buf			*bp = bip->bli_buf;
	int				first_bit;
	int				nbits;
	int				next_bit;
	int				last_bit;

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (first_bit == -1)
		return;

	(*nvecs)++;
	*nbytes += xfs_buf_log_format_size(blfp);

	do {
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);

		/*
		 * Straddling a page is rare because we don't log contiguous
		 * chunks of unmapped buffers anywhere.
		 */
		if (nbits > 1 &&
		    xfs_buf_item_straddle(bp, offset, first_bit, nbits))
			goto slow_scan;

		(*nvecs)++;
		*nbytes += nbits * XFS_BLF_CHUNK;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;

slow_scan:
	/* Count the first bit we jumped out of the above loop from */
	(*nvecs)++;
	*nbytes += XFS_BLF_CHUNK;
	last_bit = first_bit;
	while (last_bit != -1) {
		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					last_bit + 1);
		/*
		 * If we run out of bits, leave the loop,
		 * else if we find a new set of bits bump the number of vecs,
		 * else keep scanning the current set of bits.
		 */
		if (next_bit == -1) {
			break;
		} else if (next_bit != last_bit + 1 ||
			   xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
			last_bit = next_bit;
			first_bit = next_bit;
			(*nvecs)++;
			nbits = 1;
		} else {
			last_bit = next_bit;
			nbits++;
		}
		*nbytes += XFS_BLF_CHUNK;
	}
}
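/*
 * Worked example (illustrative only, assuming the usual 128 byte
 * XFS_BLF_CHUNK): a segment whose dirty bitmap has bits 0-2 and bit 10 set
 * contains two contiguous runs, so the fast path above accounts for one
 * format iovec plus two data iovecs and adds
 * xfs_buf_log_format_size(blfp) + 3 * 128 + 1 * 128 bytes to *nbytes.
 */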
/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as though
 * they came from separate buffers, just like would occur if multiple buffers
 * were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			i;
	int			bytes;
	uint			offset = 0;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it. It is not being
		 * included in the transaction commit, so no vectors are used at
		 * all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * The vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	bytes = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
					  nvecs, &bytes);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Round up the buffer size required to minimise the number of memory
	 * allocations that need to be done as this item grows when relogged by
	 * repeated modifications.
	 */
	*nbytes = round_up(bytes, 512);
	trace_xfs_buf_item_size(bip);
}
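/*
 * Note on the round_up() above: the byte count handed back to the CIL is
 * rounded to the next multiple of 512, so a segment needing, say, 532 bytes
 * reserves 1024. As the comment in the function says, this limits how often
 * the backing memory has to be reallocated as the item grows across repeated
 * relogging.
 */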
static inline void
xfs_buf_item_copy_iovec(
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	struct xfs_buf		*bp,
	uint			offset,
	int			first_bit,
	uint			nbits)
{
	offset += first_bit * XFS_BLF_CHUNK;
	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
			xfs_buf_offset(bp, offset),
			nbits * XFS_BLF_CHUNK);
}
static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	uint			offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf		*bp = bip->bli_buf;
	uint			base_size;
	int			first_bit;
	int			last_bit;
	int			next_bit;
	uint			nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * If the map is not dirty in the transaction, mark
		 * the size as zero and do not advance the vector pointer.
		 */
		return;
	}

	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}

	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	do {
		ASSERT(first_bit >= 0);
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);

		/*
		 * Straddling a page is rare because we don't log contiguous
		 * chunks of unmapped buffers anywhere.
		 */
		if (nbits > 1 &&
		    xfs_buf_item_straddle(bp, offset, first_bit, nbits))
			goto slow_scan;

		xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
					first_bit, nbits);
		blfp->blf_size++;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;

slow_scan:
	ASSERT(bp->b_addr == NULL);
	last_bit = first_bit;
	nbits = 1;
	for (;;) {
		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)last_bit + 1);
		/*
		 * If we run out of bits fill in the last iovec and get out of
		 * the loop. Else if we start a new set of bits then fill in
		 * the iovec for the series we were looking at and start
		 * counting the bits in the new one. Else we're still in the
		 * same set of bits so just keep counting and scanning.
		 */
		if (next_bit == -1) {
			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
						first_bit, nbits);
			blfp->blf_size++;
			break;
		} else if (next_bit != last_bit + 1 ||
			   xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
						first_bit, nbits);
			blfp->blf_size++;
			first_bit = next_bit;
			last_bit = next_bit;
			nbits = 1;
		} else {
			last_bit++;
			nbits++;
		}
	}
}
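/*
 * Sketch of the resulting log vector layout for one dirty segment (an
 * illustration assuming two discontiguous dirty ranges): an
 * XLOG_REG_TYPE_BFORMAT region carrying the xfs_buf_log_format header and its
 * dirty bitmap, followed by one XLOG_REG_TYPE_BCHUNK region per contiguous
 * run of dirty chunks, with blf_size counting the format region plus each
 * data region.
 */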
/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item. It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_log_iovec	*vecp = NULL;
	uint			offset = 0;
	int			i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer
	 * this state if the inode buffer allocation has not yet been committed
	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
	 * correct replay of the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_has_v3inodes(lip->li_log->l_mp) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
					    &bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}
/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We take a reference to the buffer log item here so that the BLI life cycle
 * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
 * inserted into the AIL.
 *
 * We also need to take a reference to the buffer itself as the BLI unpin
 * processing requires accessing the buffer after the BLI has dropped the final
 * BLI reference. See xfs_buf_item_unpin() for an explanation.
 * If unpins race to drop the final BLI reference and only the
 * BLI owns a reference to the buffer, then the loser of the race can have the
 * buffer freed from under it (e.g. on shutdown). Taking a buffer reference per
 * pin count ensures the life cycle of the buffer extends for as
 * long as we hold the buffer pin reference in xfs_buf_item_unpin().
 */
static void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	xfs_buf_hold(bip->bli_buf);
	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}
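/*
 * Reference state after a pin (restating the calls above): the transaction's
 * existing BLI reference is still held, one extra BLI reference and one
 * buffer hold have been taken for the pin, and b_pin_count has gone up by
 * one. xfs_buf_item_unpin() is expected to unwind exactly this set.
 */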
/*
 * This is called to unpin the buffer associated with the buf log item which was
 * previously pinned with a call to xfs_buf_item_pin(). We enter this function
 * with a buffer pin count, a buffer reference and a BLI reference.
 *
 * We must drop the BLI reference before we unpin the buffer because the AIL
 * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
 * refcount drops to zero, the bli could still be AIL resident and the buffer
 * submitted for I/O at any point before we return. This can result in IO
 * completion freeing the buffer while we are still trying to access it here.
 * This race condition can also occur in shutdown situations where we abort and
 * unpin buffers from contexts other than journal IO completion.
 *
 * Hence we have to hold a buffer reference per pin count to ensure that the
 * buffer cannot be freed until we have finished processing the unpin operation.
 * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
 * are done processing the buffer state. In the case of an abort (remove =
 * true) then we re-use the current pin reference as the IO reference we hand
 * off to IO failure handling.
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int			remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			stale = bip->bli_flags & XFS_BLI_STALE;
	int			freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/*
	 * Nothing to do but drop the buffer pin reference if the BLI is
	 * still active.
	 */
	if (!freed) {
		xfs_buf_rele(bp);
		return;
	}

	if (stale) {
		ASSERT(bip->bli_flags & XFS_BLI_STALE);
		ASSERT(xfs_buf_islocked(bp));
		ASSERT(bp->b_flags & XBF_STALE);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		ASSERT(list_empty(&lip->li_trans));
		ASSERT(!bp->b_transp);

		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * The buffer has been locked and referenced since it was marked
		 * stale so we own both lock and reference exclusively here. We
		 * do not need the pin reference any more, so drop it now so
		 * that we only have one reference to drop once item completion
		 * processing is complete.
		 */
		xfs_buf_rele(bp);

		/*
		 * If we get called here because of an IO error, we may or may
		 * not have the item on the AIL. xfs_trans_ail_delete() will
		 * take care of that situation. xfs_trans_ail_delete() drops
		 * the AIL lock.
		 */
		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
			xfs_buf_item_done(bp);
			xfs_buf_inode_iodone(bp);
			ASSERT(list_empty(&bp->b_li_list));
		} else {
			xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
			xfs_buf_item_relse(bp);
			ASSERT(bp->b_log_item == NULL);
		}
		xfs_buf_relse(bp);
		return;
	}

	if (remove) {
		/*
		 * We need to simulate an async IO failure here to ensure that
		 * the correct error completion is run on this buffer. This
		 * requires a reference to the buffer and for the buffer to be
		 * locked. We can safely pass ownership of the pin reference to
		 * the IO to ensure that nothing can free the buffer while we
		 * wait for the lock and then run the IO failure completion.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
		return;
	}

	/*
	 * BLI has no more active references - it will be moved to the AIL to
	 * manage the remaining BLI/buffer life cycle. There is nothing left for
	 * us to do here so drop the pin reference to the buffer.
	 */
	xfs_buf_rele(bp);
}
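/*
 * Summary of the unpin exit paths above: a still-referenced BLI just drops
 * the pin's buffer hold; a stale BLI completes the invalidation (via
 * xfs_buf_item_done()/xfs_buf_inode_iodone() or AIL removal plus
 * xfs_buf_item_relse()) and releases the buffer; an abort (remove == true)
 * hands the pin reference to a simulated async write failure; otherwise the
 * BLI stays around for the AIL and only the pin's buffer hold is dropped.
 */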
STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone else
		 * issues a log force to unpin the stale buffer. Check for the
		 * race condition here so xfsaild recognizes the buffer is pinned
		 * and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	    "Failing async write on buffer block 0x%llx. Retrying async write.",
					  (long long)xfs_buf_daddr(bp));
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}
/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 *
 * Return true if the bli is freed, false otherwise.
 */
bool
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_log_item	*lip = &bip->bli_item;
	bool			aborted;
	bool			dirty;

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return false;

	/*
	 * We dropped the last ref and must free the item if clean or aborted.
	 * If the bli is dirty and non-aborted, the buffer was clean in the
	 * transaction but still awaiting writeback from previous changes. In
	 * that case, the bli is freed on buffer writeback completion.
	 */
	aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
			xlog_is_shutdown(lip->li_log);
	dirty = bip->bli_flags & XFS_BLI_DIRTY;
	if (dirty && !aborted)
		return false;

	/*
	 * The bli is aborted or clean. An aborted item may be in the AIL
	 * regardless of dirty state. For example, consider an aborted
	 * transaction that invalidated a dirty bli and cleared the dirty
	 * state.
	 */
	if (aborted)
		xfs_trans_ail_delete(lip, 0);
	xfs_buf_item_relse(bip->bli_buf);
	return true;
}
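/*
 * Decision table for the final reference drop above: dirty && !aborted keeps
 * the bli alive until buffer writeback completion frees it; any other
 * combination (clean, or aborted regardless of dirty state) removes the item
 * from the AIL if it was aborted and frees the bli via xfs_buf_item_relse().
 */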
/*
 * Release the buffer associated with the buf log item. If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count. It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now. This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer. This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			released;
	bool			hold = bip->bli_flags & XFS_BLI_HOLD;
	bool			stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
	bool			aborted = test_bit(XFS_LI_ABORTED,
						   &lip->li_flags);
#endif

	trace_xfs_buf_item_release(bip);

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/*
	 * Unref the item and unlock the buffer unless held or stale. Stale
	 * buffers remain locked until final unpin unless the bli is freed by
	 * the unref call. The latter implies shutdown because buffer
	 * invalidation dirties the bli and transaction.
	 */
	released = xfs_buf_item_put(bip);
	if (hold || (stale && !released))
		return;
	ASSERT(!stale || aborted);
	xfs_buf_relse(bp);
}
STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_csn_t		seq)
{
	return xfs_buf_item_release(lip);
}
/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters. For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery. If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log. We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t		lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);
	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}
#ifdef DEBUG_EXPENSIVE
static int
xfs_buf_item_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_mount	*mp = bp->b_mount;
	xfs_failaddr_t		fa;

	if (!bp->b_ops || !bp->b_ops->verify_struct)
		return 0;
	if (bip->bli_flags & XFS_BLI_STALE)
		return 0;

	fa = bp->b_ops->verify_struct(bp);
	if (fa) {
		xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
				bp->b_addr, BBTOB(bp->b_length), fa);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(fa == NULL);
	}

	return 0;
}
#else
# define xfs_buf_item_precommit	NULL
#endif
static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_precommit	= xfs_buf_item_precommit,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};
static void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int			count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
				GFP_KERNEL | __GFP_NOFAIL);
}
STATIC void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	if (bip->bli_formats != &bip->__bli_format) {
		kfree(bip->bli_formats);
		bip->bli_formats = NULL;
	}
}
/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
	struct xfs_buf	*bp,
	struct xfs_mount *mp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	int			chunks;
	int			map_size;
	int			i;

	/*
	 * Check to see if there is already a buf log item for
	 * this buffer. If we do already have one, there is
	 * nothing to do here so return.
	 */
	ASSERT(bp->b_mount == mp);
	if (bip) {
		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
		ASSERT(!bp->b_transp);
		ASSERT(bip->bli_buf == bp);
		return 0;
	}

	bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
	bip->bli_buf = bp;

	/*
	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
	 * can be divided into. Make sure not to truncate any pieces.
	 * map_size is the size of the bitmap needed to describe the
	 * chunks of the buffer.
	 *
	 * Discontiguous buffer support follows the layout of the underlying
	 * buffer. This makes the implementation as simple as possible.
	 */
	xfs_buf_item_get_format(bip, bp->b_map_count);

	for (i = 0; i < bip->bli_format_count; i++) {
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				      XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);

		if (map_size > XFS_BLF_DATAMAP_SIZE) {
			kmem_cache_free(xfs_buf_item_cache, bip);
			xfs_err(mp,
	"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
					map_size,
					BBTOB(bp->b_maps[i].bm_len));
			return -EFSCORRUPTED;
		}

		bip->bli_formats[i].blf_type = XFS_LI_BUF;
		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
		bip->bli_formats[i].blf_map_size = map_size;
	}

	bp->b_log_item = bip;
	xfs_buf_hold(bp);
	return 0;
}
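/*
 * Bitmap sizing example for the loop above (illustrative, assuming the usual
 * 128 byte XFS_BLF_CHUNK and 32-bit NBWORD): a single-map 4k buffer gives
 * chunks = 32 and map_size = 1, while a 64k buffer gives chunks = 512 and
 * map_size = 16, which is the XFS_BLF_DATAMAP_SIZE limit checked above.
 */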
/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
static void
xfs_buf_item_log_segment(
	uint			first,
	uint			last,
	uint			*map)
{
	uint			first_bit;
	uint			last_bit;
	uint			bits_to_set;
	uint			bits_set;
	uint			word_num;
	uint			*wordp;
	uint			bit;
	uint			end_bit;
	uint			mask;

	ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
	ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

	/*
	 * Convert byte offsets to bit numbers.
	 */
	first_bit = first >> XFS_BLF_SHIFT;
	last_bit = last >> XFS_BLF_SHIFT;

	/*
	 * Calculate the total number of bits to be set.
	 */
	bits_to_set = last_bit - first_bit + 1;

	/*
	 * Get a pointer to the first word in the bitmap
	 * to set a bit in.
	 */
	word_num = first_bit >> BIT_TO_WORD_SHIFT;
	wordp = &map[word_num];

	/*
	 * Calculate the starting bit in the first word.
	 */
	bit = first_bit & (uint)(NBWORD - 1);

	/*
	 * First set any bits in the first word of our range.
	 * If it starts at bit 0 of the word, it will be
	 * set below rather than here. That is what the variable
	 * bit tells us. The variable bits_set tracks the number
	 * of bits that have been set so far. End_bit is the number
	 * of the last bit to be set in this word plus one.
	 */
	if (bit) {
		end_bit = min(bit + bits_to_set, (uint)NBWORD);
		mask = ((1U << (end_bit - bit)) - 1) << bit;
		*wordp |= mask;
		wordp++;
		bits_set = end_bit - bit;
	} else {
		bits_set = 0;
	}

	/*
	 * Now set bits a whole word at a time that are between
	 * first_bit and last_bit.
	 */
	while ((bits_to_set - bits_set) >= NBWORD) {
		*wordp = 0xffffffff;
		bits_set += NBWORD;
		wordp++;
	}

	/*
	 * Finally, set any bits left to be set in one last partial word.
	 */
	end_bit = bits_to_set - bits_set;
	if (end_bit) {
		mask = (1U << end_bit) - 1;
		*wordp |= mask;
	}
}
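/*
 * Byte-to-bit conversion example for the function above (assuming the usual
 * XFS_BLF_SHIFT of 7, i.e. 128 byte chunks): logging bytes 130 through 600
 * gives first_bit = 130 >> 7 = 1 and last_bit = 600 >> 7 = 4, so four bits
 * are set and the dirty range is rounded out to chunks 1-4 (bytes 128-639).
 */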
/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
	struct xfs_buf_log_item	*bip,
	uint			first,
	uint			last)
{
	int			i;
	uint			start;
	uint			end;
	struct xfs_buf		*bp = bip->bli_buf;

	/*
	 * walk each buffer segment and mark them dirty appropriately.
	 */
	start = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		if (start > last)
			break;
		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

		/* skip to the map that includes the first byte to log */
		if (first > end) {
			start += BBTOB(bp->b_maps[i].bm_len);
			continue;
		}

		/*
		 * Trim the range to this segment and mark it in the bitmap.
		 * Note that we must convert buffer offsets to segment relative
		 * offsets (e.g., the first byte of each segment is byte 0 of
		 * that segment).
		 */
		if (first < start)
			first = start;
		if (end > last)
			end = last;
		xfs_buf_item_log_segment(first - start, end - start,
					 &bip->bli_formats[i].blf_data_map[0]);

		start += BBTOB(bp->b_maps[i].bm_len);
	}
}
/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
	struct xfs_buf_log_item	*bip)
{
	int			i;

	for (i = 0; i < bip->bli_format_count; i++) {
		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
				      bip->bli_formats[i].blf_map_size))
			return true;
	}

	return false;
}
STATIC void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kvfree(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_cache, bip);
}
/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
void
xfs_buf_item_relse(
	struct xfs_buf	*bp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	trace_xfs_buf_item_relse(bp, _RET_IP_);
	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));

	if (atomic_read(&bip->bli_refcount))
		return;
	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}
void
xfs_buf_item_done(
	struct xfs_buf		*bp)
{
	/*
	 * If we are forcibly shutting down, this may well be off the AIL
	 * already. That's because we simulate the log-committed callbacks to
	 * unpin these buffers. Or we may never have put this item on AIL
	 * because the transaction was aborted forcibly.
	 * xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, AIL is useless if we're forcing a shutdown.
	 *
	 * Note that log recovery writes might have buffer items that are not on
	 * the AIL even when the file system is not shut down.
	 */
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp);
}