/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_bio.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
/*
 * This structure is used to track blocks as we allocate them, so that
 * we can free them if we encounter an error during allocation.  We
 * keep track of five pieces of information for each allocated block:
 *   - The number of the newly allocated block
 *   - The size of the block (lets us deal with fragments if we want)
 *   - The number of the block containing a pointer to it; or whether
 *     the pointer is in the inode
 *   - The offset within the block (or inode) containing a pointer to it.
 *   - A flag indicating the usage of the block.  (Logging needs to know
 *     this to avoid overwriting a data block if it was previously used
 *     for metadata.)
 */
enum ufs_owner_type {
    ufs_no_owner,		/* Owner has not yet been updated */
    ufs_inode_direct,		/* Listed in inode's direct block table */
    ufs_inode_indirect,		/* Listed in inode's indirect block table */
    ufs_indirect_block		/* Listed in an indirect block */
};
struct ufs_allocated_block {
    daddr_t this_block;			/* Number of this block */
    off_t block_size;			/* Size of this block, in bytes */
    enum ufs_owner_type owner;		/* Who points to this block? */
    daddr_t owner_block;		/* Number of the owning block */
    uint_t owner_offset;		/* Offset within that block or inode */
    int usage_flags;			/* Usage flags, as expected by free() */
};
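
/*
 * Sketch of the intended life cycle of a table entry (this mirrors the
 * code in bmap_write() below; `nb' is a newly allocated block):
 *
 *	undo_table[alloced_blocks].this_block = nb;
 *	undo_table[alloced_blocks].owner = ufs_no_owner;
 *	...
 *	alloced_blocks++;
 *
 * Once the pointer to the block has been recorded, the owner fields are
 * filled in; on failure, ufs_undo_allocation() clears the recorded owner
 * pointers and then frees the blocks.
 */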
static int	findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
		    int maxtransfer);
static void	ufs_undo_allocation(inode_t *ip, int block_count,
		    struct ufs_allocated_block table[],
		    int inode_sector_adjust);
/*
 * Find the extent and the matching block number.
 *
 * bsize > PAGESIZE
 *	boff indicates that we want a page in the middle
 *	min expression is supposed to make sure no extra page[s] after EOF
 * PAGESIZE >= bsize
 *	we assume that a page is a multiple of bsize, i.e.,
 *	boff always == 0
 *
 * We always return a length that is suitable for a disk transfer.
 */
#define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
	register daddr32_t *dp = (tblp); \
	register int _chkfrag = chkfrag; /* for lint. sigh */ \
	register int len; \
	len = findextent(fs, dp, (int)(n), lenp, maxtrans) << \
	    fs->fs_bshift; \
	if (_chkfrag) { \
		register u_offset_t tmp; \
		tmp = fragroundup((fs), size) - \
		    (((u_offset_t)lbn) << fs->fs_bshift); \
		len = (int)MIN(tmp, len); \
	} \
	*(bnp) = fsbtodb(fs, *dp) + btodb(boff); \
}
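
/*
 * Worked example of the block-number arithmetic above (illustrative
 * values, assuming a 1K fragment size and DEV_BSIZE == 512): fsbtodb()
 * shifts the fragment number *dp left by fs_fsbtodb == 1, and btodb()
 * converts the byte offset boff to 512-byte sectors.  So for *dp == 100
 * and boff == 4096, *(bnp) == (100 << 1) + (4096 >> 9) == 200 + 8 == 208.
 */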
/*
 * The maximum supported file size is actually somewhat less than 1
 * terabyte.  This is because the total number of blocks used for the
 * file and its metadata must fit into the ic_blocks field of the
 * inode, which is a signed 32-bit quantity.  The metadata allocated
 * for a file (that is, the single, double, and triple indirect blocks
 * used to reference the file blocks) is actually quite small,
 * but just to make sure, we check for overflow in the ic_blocks
 * field for all files whose total block count is
 * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
 * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
 * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
 * field if the number of blocks currently allocated to the file is
 * greater than VERYLARGEFILESIZE.
 *
 * Note that file "size" is not the same as file "length".  A
 * file's "size" is the number of blocks allocated to it.  A file's
 * "length" is the maximum offset in the file.  A UFS file can have a
 * length of a terabyte, but the size is limited to somewhat less than
 * a terabyte, as described above.
 */
#define	VERYLARGEFILESIZE	0x7FE00000
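
/*
 * Worked out: a terabyte is 2^40 bytes == 2^31 512-byte blocks, and a
 * gigabyte is 2^30 bytes == 2^21 512-byte blocks, so VERYLARGEFILESIZE
 * == 2^31 - 2^21 == 0x80000000 - 0x00200000 == 0x7FE00000.
 */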
/*
 * bmap{read,write} define the structure of file system storage by mapping
 * a logical offset in a file to a physical block number on the device.
 * It should be called with a locked inode when allocation is to be
 * done (bmap_write).  Note this strangeness: bmap_write is always called from
 * getpage(), not putpage(), since getpage() is where all the allocation
 * is done.
 *
 * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
 *
 * NOTICE: the block number returned is the disk block number, not the
 * file system block number.  All the worries about block offsets and
 * page/block sizes are hidden inside of bmap.  Well, not quite,
 * unfortunately.  It's impossible to find one place to hide all this
 * mess.  There are 3 cases:
 *
 * PAGESIZE < bsize
 *	In this case, the {get,put}page routines will attempt to align to
 *	a file system block boundary (XXX - maybe this is a mistake?).  Since
 *	the kluster routines may be out of memory, we don't always get all
 *	the pages we wanted.  If we called bmap first, to find out how much
 *	to kluster, we handed in the block aligned offset.  If we didn't get
 *	all the pages, we have to chop off the amount we didn't get from the
 *	amount handed back by bmap.
 *
 * PAGESIZE == bsize
 *	Life is quite pleasant here, no extra work needed, mainly because we
 *	(probably?) won't kluster backwards, just forwards.
 *
 * PAGESIZE > bsize
 *	This one has a different set of problems, specifically, we may have to
 *	do N reads to fill one page.  Let us hope that Sun will stay with small
 *	pages.
 *
 * Returns 0 on success, or a non-zero errno if an error occurs.
 *
 * TODO:
 *	LMXXX - add a bmap cache.  This could be a couple of extents in the
 *	inode.  Two is nice for PAGESIZE > bsize.
 */
int
bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
{
    daddr_t lbn;
    ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
    struct fs *fs = ufsvfsp->vfs_fs;
    struct buf *bp;
    daddr32_t *bap;
    daddr_t ob, nb, tbn;
    int i, j, boff, err;
    int shft;				/* we maintain sh = 1 << shft */
    int nindirshift, nindiroffset;

    ASSERT(RW_LOCK_HELD(&ip->i_contents));
    lbn = (daddr_t)lblkno(fs, off);
    boff = (int)blkoff(fs, off);

    /*
     * The first NDADDR blocks are direct blocks.
     */
    if (lbn < NDADDR) {
        DOEXTENT(fs, lbn, boff, bnp, lenp,
            ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
            ufsvfsp->vfs_iotransz);
        return (0);
    }

    nindirshift = ufsvfsp->vfs_nindirshift;
    nindiroffset = ufsvfsp->vfs_nindiroffset;

    /*
     * Determine how many levels of indirection.
     */
    shft = 0;				/* sh = 1 */
    tbn = lbn - NDADDR;
    for (j = NIADDR; j > 0; j--) {
        longlong_t sh;

        shft += nindirshift;		/* sh *= nindir */
        sh = 1LL << shft;
        if (tbn < sh)
            break;
        tbn -= sh;
    }

    /*
     * Fetch the first indirect block.
     */
    nb = ip->i_ib[NIADDR - j];

    /*
     * Fetch through the indirect blocks.
     */
    for (; j <= NIADDR; j++) {
        ob = nb;
        bp = UFS_BREAD(ufsvfsp,
            ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
        if (bp->b_flags & B_ERROR) {
            err = geterror(bp);
            brelse(bp);
            return (err);
        }
        bap = bp->b_un.b_daddr;

        ASSERT(!ufs_indir_badblock(ip, bap));

        shft -= nindirshift;		/* sh /= nindir */
        i = (tbn >> shft) & nindiroffset;	/* (tbn / sh) % nindir */
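
        /*
         * Example of the index arithmetic above (illustrative values):
         * with 8K blocks an indirect block holds 2048 daddr32_t
         * entries, so nindirshift == 11 and nindiroffset == 2047.  One
         * level down, shft drops from 22 to 11, and for tbn == 5000 the
         * entry index is i == (5000 >> 11) & 2047 == 2, i.e.
         * (5000 / 2048) % 2048.
         */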
    }

    DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
        MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
        0, ufsvfsp->vfs_iotransz);
    brelse(bp);
    return (0);
}
/*
 * See bmap_read for general notes.
 *
 * The block must be at least size bytes and will be extended or
 * allocated as needed.  If alloc_type is of type BI_ALLOC_ONLY, then bmap
 * will not create any in-core pages that correspond to the new disk allocation.
 * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
 * and security is maintained because upon reading a negative block number
 * pages are zeroed.  For all other allocation types (BI_NORMAL) the in-core
 * pages will be created and initialized as needed.
 *
 * Returns 0 on success, or a non-zero errno if an error occurs.
 */
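
/*
 * For example (illustrative block number): a BI_FALLOCATE allocation of
 * block 1234 stores -1234 in the block list; a later read sees the
 * negative entry, treats it as unbacked, and returns zeroed pages, so
 * stale on-disk contents are never exposed.
 */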
int
bmap_write(struct inode *ip, u_offset_t off, int size,
    enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
{
    int shft;				/* we maintain sh = 1 << shft */
    daddr_t ob, nb, pref, lbn, llbn, tbn;
    struct vnode *vp = ITOV(ip);
    long bsize = VBSIZE(vp);
    int issync, metaflag, isdirquota;
    struct fs *fs;
    struct buf *bp, *nbp;
    struct fbuf *fbp = NULL;
    daddr32_t *bap;
    dev_t dev = ip->i_dev;
    int osize, nsize;
    int i, j, err;
    int nindirshift, nindiroffset;
    struct ufsvfs *ufsvfsp;
    int added_sectors;			/* sectors added to this inode */
    int alloced_blocks;			/* fs blocks newly allocated */
    struct ufs_allocated_block undo_table[NIADDR+1];
    int verylargefile = 0;
    ASSERT(RW_WRITE_HELD(&ip->i_contents));

    ufsvfsp = ip->i_ufsvfs;
    fs = ufsvfsp->vfs_bufp->b_un.b_fs;
    lbn = (daddr_t)lblkno(fs, off);
    if (ip->i_blocks >= VERYLARGEFILESIZE)
        verylargefile = 1;
    llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
    metaflag = isdirquota = 0;
    if (((ip->i_mode & IFMT) == IFDIR) ||
        ((ip->i_mode & IFMT) == IFATTRDIR))
        isdirquota = metaflag = I_DIR;
    else if ((ip->i_mode & IFMT) == IFSHAD)
        metaflag = I_SHAD;
    else if (ip->i_ufsvfs->vfs_qinod == ip)
        isdirquota = metaflag = I_QUOTA;

    issync = ((ip->i_flag & ISYNC) != 0);

    if (isdirquota || issync) {
        alloc_type = BI_NORMAL;		/* make sure */
    }
    /*
     * If the next write will extend the file into a new block,
     * and the file is currently composed of a fragment,
     * this fragment has to be extended to be a full block.
     */
    if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
        osize = blksize(fs, ip, llbn);
        if (osize < bsize && osize > 0) {
            /*
             * Check to see if doing this will make the file too
             * big.  Only check if we are dealing with a very
             * large file.
             */
            if (verylargefile == 1) {
                if (((unsigned)ip->i_blocks +
                    btodb(bsize - osize)) > INT_MAX) {
                    return (EFBIG);
                }
            }
            /*
             * Make sure we have all needed pages setup correctly.
             *
             * We pass S_OTHER to fbread here because we want
             * an exclusive lock on the page in question
             * (see ufs_getpage).  I/O to the old block location
             * may still be in progress and we are about to free
             * the old block.  We don't want anyone else to get
             * a hold of the old block once we free it until
             * the I/O is complete.
             */
            err = fbread(ITOV(ip), ((offset_t)llbn << fs->fs_bshift),
                (uint_t)bsize, S_OTHER, &fbp);
            if (err)
                return (err);
            pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
            err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
                &nb, cr);
            if (err) {
                fbrelse(fbp, S_OTHER);
                return (err);
            }

            ASSERT(!ufs_badblock(ip, nb));
            /*
             * Update the inode before releasing the
             * lock on the page.  If we released the page
             * lock first, the data could be written to its
             * old address and then destroyed.
             */
            TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
            UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
                ip);
            ip->i_blocks += btodb(bsize - osize);
            ASSERT((unsigned)ip->i_blocks <= INT_MAX);
            TRANS_INODE(ufsvfsp, ip);
            ip->i_flag |= IUPD | ICHG | IATTCHG;
            /* Caller is responsible for updating i_seq */
            /*
             * Don't check metaflag here, directories won't do this
             */
            if (isdirquota) {
                (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
            } else {
                fbrelse(fbp, S_WRITE);
            }

            (void) free(ip, ob, (off_t)osize, metaflag);
        }
    }
    /*
     * The first NDADDR blocks are direct blocks.
     */
    if (lbn < NDADDR) {
        ob = ip->i_db[lbn];
        if (ob != 0 &&
            ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
            /* consider need to reallocate a frag */
            osize = fragroundup(fs, blkoff(fs, ip->i_size));
            nsize = fragroundup(fs, size);
            /*
             * Check to see if doing this will make the
             * file too big.  Only check if we are dealing
             * with a very large file.
             */
            if (verylargefile == 1) {
                if (((unsigned)ip->i_blocks +
                    btodb(nsize - osize)) > INT_MAX) {
                    return (EFBIG);
                }
            }
            /*
             * need to re-allocate a block or frag
             */
            pref = blkpref(ip, lbn, (int)lbn,
                &ip->i_db[0]);
            err = realloccg(ip, ob, pref, (int)osize,
                (int)nsize, &nb, cr);
            if (err)
                return (err);

            ASSERT(!ufs_badblock(ip, nb));
        } else {
            /*
             * need to allocate a block or frag
             */
            osize = 0;
            if (ip->i_size <
                ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
                nsize = fragroundup(fs, size);
            else
                nsize = bsize;
            /*
             * Check to see if doing this will make the
             * file too big.  Only check if we are dealing
             * with a very large file.
             */
            if (verylargefile == 1) {
                if (((unsigned)ip->i_blocks +
                    btodb(nsize - osize)) > INT_MAX) {
                    return (EFBIG);
                }
            }
            pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
            err = alloc(ip, pref, (int)nsize, &nb, cr);
            if (err)
                return (err);

            ASSERT(!ufs_badblock(ip, nb));
        }
        /*
         * Read old/create new zero pages
         */

        /*
         * mmap S_WRITE faults always enter here
         */

        /*
         * We zero it if it's also BI_FALLOCATE, but
         * only for direct blocks!
         */
        if (alloc_type == BI_NORMAL ||
            alloc_type == BI_FALLOCATE ||
            P2ROUNDUP_TYPED(size,
            PAGESIZE, u_offset_t) < nsize) {
            /* fbzero doesn't cause a pagefault */
            fbzero(ITOV(ip),
                ((offset_t)lbn << fs->fs_bshift),
                (uint_t)nsize, &fbp);
        } else {
            err = fbread(ITOV(ip),
                ((offset_t)lbn << fs->fs_bshift),
                (uint_t)nsize, S_OTHER, &fbp);
        }
        if (err) {
            if (osize == 0)
                (void) free(ip, nb,
                    (off_t)nsize, metaflag);
            else
                (void) free(ip,
                    ob + numfrags(fs, osize),
                    (off_t)(nsize - osize),
                    metaflag);
            ASSERT(nsize >= osize);
            (void) chkdq(ip,
                -(long)btodb(nsize - osize),
                0, cr, (char **)NULL,
                (size_t *)NULL);
            return (err);
        }
        TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
        ip->i_blocks += btodb(nsize - osize);
        ASSERT((unsigned)ip->i_blocks <= INT_MAX);
        TRANS_INODE(ufsvfsp, ip);
        ip->i_flag |= IUPD | ICHG | IATTCHG;
        /* Caller is responsible for updating i_seq */

        /*
         * Write directory and shadow blocks synchronously so
         * that they never appear with garbage in them on the
         * disk.
         */
        if (isdirquota && (ip->i_size ||
            TRANS_ISTRANS(ufsvfsp))) {
            /*
             * XXX may not be necessary with harpy trans
             */
            (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
        } else {
            fbrelse(fbp, S_WRITE);
        }
        (void) free(ip, ob, (off_t)osize, metaflag);

        return (0);
    }

    added_sectors = alloced_blocks = 0;	/* No blocks alloced yet */
    /*
     * Determine how many levels of indirection.
     */
    nindirshift = ip->i_ufsvfs->vfs_nindirshift;
    nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
    shft = 0;				/* sh = 1 */
    tbn = lbn - NDADDR;
    for (j = NIADDR; j > 0; j--) {
        longlong_t sh;

        shft += nindirshift;		/* sh *= nindir */
        sh = 1LL << shft;
        if (tbn < sh)
            break;
        tbn -= sh;
    }
    /*
     * Fetch the first indirect block.
     */
    nb = ip->i_ib[NIADDR - j];
    if (nb == 0) {
        /*
         * Check to see if doing this will make the
         * file too big.  Only check if we are dealing
         * with a very large file.
         */
        if (verylargefile == 1) {
            if (((unsigned)ip->i_blocks + btodb(bsize))
                > INT_MAX)
                return (EFBIG);
        }
        /*
         * Need to allocate an indirect block.
         */
        pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
        err = alloc(ip, pref, (int)bsize, &nb, cr);
        if (err)
            return (err);
        TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
        ASSERT(!ufs_badblock(ip, nb));
        /*
         * Keep track of this allocation so we can undo it if we
         * get an error later.
         */
        ASSERT(alloced_blocks <= NIADDR);

        undo_table[alloced_blocks].this_block = nb;
        undo_table[alloced_blocks].block_size = bsize;
        undo_table[alloced_blocks].owner = ufs_no_owner;
        undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;

        alloced_blocks++;
        /*
         * Write zero block synchronously so that
         * indirect blocks never point at garbage.
         */
        bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);

        clrbuf(bp);
        /* XXX Maybe special-case this? */
        TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
        UFS_BWRITE2(ufsvfsp, bp);
        if (bp->b_flags & B_ERROR) {
            err = geterror(bp);
            brelse(bp);
            ufs_undo_allocation(ip, alloced_blocks,
                undo_table, added_sectors);
            return (err);
        }
        ip->i_ib[NIADDR - j] = nb;
        added_sectors += btodb(bsize);
        ip->i_blocks += btodb(bsize);
        ASSERT((unsigned)ip->i_blocks <= INT_MAX);
        TRANS_INODE(ufsvfsp, ip);
        ip->i_flag |= IUPD | ICHG | IATTCHG;
        /* Caller is responsible for updating i_seq */
        /*
         * Update the 'undo table' now that we've linked this block
         * to its owner.
         */
        undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
        undo_table[alloced_blocks-1].owner_offset = NIADDR - j;
        /*
         * In the ISYNC case, wrip will notice that the block
         * count on the inode has changed and will be sure to
         * ufs_iupdat the inode at the end of wrip.
         */
    }
    /*
     * Fetch through the indirect blocks.
     */
    for (; j <= NIADDR; j++) {
        ob = nb;
        bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);
        if (bp->b_flags & B_ERROR) {
            err = geterror(bp);
            brelse(bp);
            /*
             * Return any partial allocations.
             *
             * It is possible that we have not yet made any
             * allocations at this point (if this is the first
             * pass through the loop and we didn't have to
             * allocate the first indirect block, above).
             * In this case, alloced_blocks and added_sectors will
             * be zero, and ufs_undo_allocation will do nothing.
             */
            ufs_undo_allocation(ip, alloced_blocks,
                undo_table, added_sectors);
            return (err);
        }
        bap = bp->b_un.b_daddr;
        shft -= nindirshift;		/* sh /= nindir */
        i = (tbn >> shft) & nindiroffset;	/* (tbn / sh) % nindir */
        nb = bap[i];
        if (nb == 0) {
            /*
             * Check to see if doing this will make the
             * file too big.  Only check if we are dealing
             * with a very large file.
             */
            if (verylargefile == 1) {
                if (((unsigned)ip->i_blocks + btodb(bsize))
                    > INT_MAX) {
                    brelse(bp);
                    ufs_undo_allocation(ip, alloced_blocks,
                        undo_table, added_sectors);
                    return (EFBIG);
                }
            }
            if (j < NIADDR)
                pref = blkpref(ip, lbn, 0,
                    (daddr32_t *)0);
            else
                pref = blkpref(ip, lbn, i, &bap[0]);
            /*
             * release "bp" buf to avoid deadlock (re-bread later)
             */
            brelse(bp);

            err = alloc(ip, pref, (int)bsize, &nb, cr);
            if (err) {
                /*
                 * Return any partial allocations.
                 */
                ufs_undo_allocation(ip, alloced_blocks,
                    undo_table, added_sectors);
                return (err);
            }

            ASSERT(!ufs_badblock(ip, nb));
            ASSERT(alloced_blocks <= NIADDR);
            undo_table[alloced_blocks].this_block = nb;
            undo_table[alloced_blocks].block_size = bsize;
            undo_table[alloced_blocks].owner = ufs_no_owner;
            undo_table[alloced_blocks].usage_flags = metaflag |
                ((j < NIADDR) ? I_IBLK : 0);

            alloced_blocks++;
            if (j < NIADDR) {
                TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
                /*
                 * Write synchronously so indirect
                 * blocks never point at garbage.
                 */
                nbp = UFS_GETBLK(
                    ufsvfsp, dev, fsbtodb(fs, nb), bsize);

                clrbuf(nbp);
                /* XXX Maybe special-case this? */
                TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
                UFS_BWRITE2(ufsvfsp, nbp);
                if (nbp->b_flags & B_ERROR) {
                    err = geterror(nbp);
                    brelse(nbp);
                    ufs_undo_allocation(ip,
                        alloced_blocks,
                        undo_table, added_sectors);
                    return (err);
                }
            } else if (alloc_type == BI_NORMAL ||
                P2ROUNDUP_TYPED(size,
                PAGESIZE, u_offset_t) < bsize) {
                TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
                fbzero(ITOV(ip),
                    ((offset_t)lbn << fs->fs_bshift),
                    (uint_t)bsize, &fbp);
                /*
                 * Cases which we need to do a synchronous
                 * write of the zeroed data pages:
                 *
                 * 1) If we are writing a directory then we
                 * want to write synchronously so blocks in
                 * directories never contain garbage.
                 *
                 * 2) If we are filling in a hole and the
                 * indirect block is going to be synchronously
                 * written back below we need to make sure
                 * that the zeroes are written here before
                 * the indirect block is updated so that if
                 * we crash before the real data is pushed
                 * we will not end up with random data in
                 * the middle of the file.
                 *
                 * 3) If the size of the request rounded up
                 * to the system page size is smaller than
                 * the file system block size, we want to
                 * write out all the pages now so that
                 * they are not aborted before they actually
                 * make it to ufs_putpage since the length
                 * of the inode will not include the pages.
                 */
                if (isdirquota || (issync &&
                    lbn < llbn)) {
                    (void) ufs_fbiwrite(fbp, ip, nb,
                        fs->fs_fsize);
                } else {
                    fbrelse(fbp, S_WRITE);
                }
            }

            /*
             * re-acquire "bp" buf
             */
            bp = UFS_BREAD(ufsvfsp,
                ip->i_dev, fsbtodb(fs, ob), bsize);
            if (bp->b_flags & B_ERROR) {
                err = geterror(bp);
                brelse(bp);
                /*
                 * Return any partial allocations.
                 */
                ufs_undo_allocation(ip,
                    alloced_blocks,
                    undo_table, added_sectors);
                return (err);
            }
            bap = bp->b_un.b_daddr;
            /*
             * The magic explained: j will be equal to NIADDR
             * when we are at the lowest level, this is where the
             * array entries point directly to data blocks.  Since
             * we will be 'fallocate'ing we will go ahead and negate
             * the block numbers.
             */
            if (alloc_type == BI_FALLOCATE && j == NIADDR)
                nb = -nb;

            bap[i] = nb;
            TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
            added_sectors += btodb(bsize);
            ip->i_blocks += btodb(bsize);
            ASSERT((unsigned)ip->i_blocks <= INT_MAX);
            TRANS_INODE(ufsvfsp, ip);
            ip->i_flag |= IUPD | ICHG | IATTCHG;

            /* Caller is responsible for updating i_seq */
            undo_table[alloced_blocks-1].owner =
                ufs_indirect_block;
            undo_table[alloced_blocks-1].owner_block = ob;
            undo_table[alloced_blocks-1].owner_offset = i;
            UFS_BWRITE2(ufsvfsp, bp);
            if (bp->b_flags & B_ERROR) {
                err = geterror(bp);
                brelse(bp);
                ufs_undo_allocation(ip,
                    alloced_blocks,
                    undo_table, added_sectors);
                return (err);
            }
        }
    }

    return (0);
}
/*
 * Return 1 if inode has unmapped blocks (UFS holes) or if another thread
 * is in the critical region of wrip().
 */
int
bmap_has_holes(struct inode *ip)
{
    struct fs *fs = ip->i_fs;
    uint_t dblks;			/* # of data blocks */
    uint_t mblks;			/* # of data + metadata blocks */
    uint_t nindirshift, nindiroffset, nindirblks;
    uint_t cnt, n;
    daddr_t tbn;
    int j, shft;
    int fsbshift = fs->fs_bshift;
    int fsboffset = (1 << fsbshift) - 1;
    /*
     * Check for writer in critical region; if found then we
     * cannot trust the values of i_size and i_blocks:
     * simply return true.
     */
    if (ip->i_writer != NULL && ip->i_writer != curthread) {
        return (1);
    }
    dblks = (ip->i_size + fsboffset) >> fsbshift;
    mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;
    /*
     * File has only direct blocks.
     */
    if (dblks <= NDADDR)
        return (mblks < dblks);
    nindirshift = ip->i_ufsvfs->vfs_nindirshift;
    nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
    nindirblks = nindiroffset + 1;
    /*
     * Determine how many levels of indirection.
     */
    shft = 0;				/* sh = 1 */
    tbn = dblks - NDADDR;
    for (j = NIADDR; j > 0; j--) {
        longlong_t sh;

        shft += nindirshift;		/* sh *= nindir */
        sh = 1LL << shft;
        if (tbn < sh)
            break;
        tbn -= sh;
    }
    /* LINTED: warning: logical expression always true: op "||" */
    ASSERT((j == NIADDR) || (j == NIADDR-1) || (j == NIADDR-2));
    if (j == NIADDR)			/* single level indirection */
        cnt = NDADDR + 1 + dblks;
    else if (j == NIADDR-1)		/* double indirection */
        cnt = NDADDR + 1 + nindirblks +
            1 + (dblks + nindiroffset)/nindirblks + dblks;
    else if (j == NIADDR-2) {		/* triple indirection */
        n = (dblks + nindiroffset)/nindirblks;
        cnt = NDADDR + 1 + nindirblks +
            1 + nindirblks + nindirblks*nindirblks +
            1 + (n + nindiroffset)/nindirblks + n + dblks;
    }
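
    /*
     * Worked example of the double-indirection formula above
     * (illustrative values): with 8K blocks nindirblks == 2048 and
     * nindiroffset == 2047, so for dblks == 5000 we get
     * cnt == 12 + 1 + 2048 + 1 + (5000 + 2047)/2048 + 5000
     *     == 12 + 1 + 2048 + 1 + 3 + 5000 == 7065 (NDADDR == 12);
     * any mblks smaller than that implies an unmapped block.
     */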
    return (mblks < cnt);
}
/*
 * find some contig blocks starting at *sbp and going for min(n, max_contig)
 * return the number of blocks (not frags) found.
 * The array passed in must be at least [0..n-1].
 */
static int
findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
{
    register daddr_t bn, nextbn;
    register daddr32_t *bp;
    register int diff;
    int maxtransblk;
    n = MIN(n, lblkno(fs, *lenp));
    /*
     * If the user has set the value for maxcontig lower than
     * the drive transfer size, then assume they want this
     * to be the maximum value for the size of the data transfer.
     */
    maxtransblk = maxtransfer >> DEV_BSHIFT;
    if (fs->fs_maxcontig < maxtransblk) {
        n = MIN(n, fs->fs_maxcontig);
    } else {
        n = MIN(n, maxtransblk);
    }
    diff = fs->fs_frag;
    for (bp = sbp; --n > 0; bp++) {
        bn = *bp;
        nextbn = *(bp + 1);
        if (nextbn == 0 || bn + diff != nextbn)
            break;
    }
    return ((int)(bp - sbp) + 1);
}
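
/*
 * For illustration (hypothetical array contents): with 2 frags per
 * block (fs_frag == 2), sbp[] == { 100, 102, 104, 96, ... } scans as
 * 100+2 == 102 and 102+2 == 104, then stops because 104+2 != 96, so
 * findextent() reports an extent of 3 blocks.
 */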
/*
 * Free any blocks which had been successfully allocated.  Always called
 * as a result of an error, so we don't bother returning an error code
 * from here.
 *
 * If block_count and inode_sector_adjust are both zero, we'll do nothing.
 * Thus it is safe to call this as part of error handling, whether or not
 * any blocks have been allocated.
 *
 * The ufs_inode_direct case is currently unused.
 */
static void
ufs_undo_allocation(
    inode_t *ip,
    int block_count,
    struct ufs_allocated_block table[],
    int inode_sector_adjust)
{
    int i;
    int inode_changed = 0;
    int error_updating_pointers;
    struct ufsvfs *ufsvfsp;

    error_updating_pointers = 0;

    ufsvfsp = ip->i_ufsvfs;
    /*
     * Update pointers on disk before freeing blocks.  If we fail,
     * some blocks may remain busy; but they will be reclaimed by
     * an fsck.  (This is better than letting a block wind up with
     * two owners if we successfully freed it but could not remove
     * the pointer to it.)
     */
    for (i = 0; i < block_count; i++) {
        switch (table[i].owner) {
        case ufs_no_owner:
            /* Nothing to do here, nobody points to us */
            break;

        case ufs_inode_direct:
            ASSERT(table[i].owner_offset < NDADDR);
            ip->i_db[table[i].owner_offset] = 0;
            inode_changed = 1;
            break;
        case ufs_inode_indirect:
            ASSERT(table[i].owner_offset < NIADDR);
            ip->i_ib[table[i].owner_offset] = 0;
            inode_changed = 1;
            break;
        case ufs_indirect_block: {
            buf_t *bp;
            daddr32_t *block_data;

            /* Read/modify/log/write. */

            ASSERT(table[i].owner_offset <
                (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
            bp = UFS_BREAD(ufsvfsp, ip->i_dev,
                fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
                VBSIZE(ITOV(ip)));
            if (bp->b_flags & B_ERROR) {
                /* Couldn't read this block; give up. */
                error_updating_pointers = 1;
                brelse(bp);
                break;	/* out of SWITCH */
            }
            block_data = bp->b_un.b_daddr;
            block_data[table[i].owner_offset] = 0;
            /* Write a log entry which includes the zero. */
            /* It might be possible to optimize this by using */
            /* TRANS_BUF directly and zeroing only the four */
            /* bytes involved, but an attempt to do that led */
            /* to panics in the logging code.  The attempt was */
            /* TRANS_BUF(ufsvfsp, */
            /*     table[i].owner_offset * sizeof (daddr32_t), */
            /*     sizeof (daddr32_t), */
            /*     bp, */
            /*     DT_AB); */
            TRANS_BUF_ITEM_128(ufsvfsp,
                block_data[table[i].owner_offset],
                block_data, bp, DT_AB);
            /* Now we can write the buffer itself. */

            UFS_BWRITE2(ufsvfsp, bp);

            if (bp->b_flags & B_ERROR) {
                error_updating_pointers = 1;
            }
            brelse(bp);
            break;
        }
        default:
            (void) ufs_fault(ITOV(ip),
                "ufs_undo_allocation failure\n");
            break;
        }
    }
    /*
     * If the inode changed, or if we need to update its block count,
     * then do that now.  We update the inode synchronously on disk
     * to ensure that it won't transiently point at a block we've
     * freed (only necessary if we're not logging).
     *
     * NOTE: Currently ufs_iupdat() does not check for errors.  When
     * it is fixed, we should verify that we successfully updated the
     * inode before freeing blocks below.
     */
    if (inode_changed || (inode_sector_adjust != 0)) {
        ip->i_blocks -= inode_sector_adjust;
        ASSERT((unsigned)ip->i_blocks <= INT_MAX);
        TRANS_INODE(ufsvfsp, ip);
        ip->i_flag |= IUPD | ICHG | IATTCHG;

        if (!TRANS_ISTRANS(ufsvfsp))
            ufs_iupdat(ip, I_SYNC);
    }
    /*
     * Now we go through and actually free the blocks, but only if we
     * successfully removed the pointers to them.
     */
    if (!error_updating_pointers) {
        for (i = 0; i < block_count; i++) {
            free(ip, table[i].this_block, table[i].block_size,
                table[i].usage_flags);
        }
    }
}
/*
 * Find the next hole or data block in file starting at *off.
 * Return found offset in *off, which can be less than the
 * starting offset if not block aligned.
 * This code is based on bmap_read().
 * Errors: ENXIO for end of file,
 *	   EIO for block read error.
 */
int
bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
{
    ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
    struct fs *fs = ufsvfsp->vfs_fs;
    buf_t *bp[NIADDR];
    int i, j;
    int shft;				/* we maintain sh = 1 << shft */
    int nindirshift, nindiroffset;
    daddr_t ob, nb, tbn, lbn, skip;
    daddr32_t *bap;
    u_offset_t isz = (offset_t)ip->i_size;
    int32_t bs = fs->fs_bsize;		/* file system block size */
    int32_t nindir = fs->fs_nindir;
    dev_t dev = ip->i_dev;
    int error = 0;
    daddr_t limits[NIADDR];
    ASSERT(RW_LOCK_HELD(&ip->i_contents));
    lbn = (daddr_t)lblkno(fs, *off);

    for (i = 0; i < NIADDR; i++)
        bp[i] = NULL;
    /*
     * The first NDADDR blocks are direct blocks.
     */
    if (lbn < NDADDR) {
        for (; lbn < NDADDR; lbn++) {
            if ((hole && (ip->i_db[lbn] == 0)) ||
                (!hole && (ip->i_db[lbn] != 0))) {
                goto out;
            }
        }
        if ((u_offset_t)lbn << fs->fs_bshift >= isz)
            goto out;
    }
    nindir = fs->fs_nindir;
    nindirshift = ufsvfsp->vfs_nindirshift;
    nindiroffset = ufsvfsp->vfs_nindiroffset;
    /* Set up limits array */
    for (limits[0] = NDADDR, j = 1; j < NIADDR; j++)
        limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
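
    /*
     * Example (assuming 8K blocks, nindirshift == 11): limits[0] ==
     * NDADDR, limits[1] == NDADDR + 2048, limits[2] == NDADDR + 2048 +
     * 4194304; each entry is the first logical block that needs the
     * next level of indirection.
     */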
    /*
     * Determine how many levels of indirection.
     */
    shft = 0;				/* sh = 1 */
    tbn = lbn - NDADDR;
    for (j = NIADDR; j > 0; j--) {
        longlong_t sh;

        shft += nindirshift;		/* sh *= nindir */
        sh = 1LL << shft;
        if (tbn < sh)
            break;
        tbn -= sh;
    }
    if (j == 0) {
        /* must have passed end of file */
        ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
        goto out;
    }
    /*
     * Fetch the first indirect block.
     */
    nb = ip->i_ib[NIADDR - j];
    if (nb == 0) {
        if (hole) {
            lbn = limits[NIADDR - j];
            goto out;
        }
        lbn = limits[NIADDR - j + 1];
        if ((u_offset_t)lbn << fs->fs_bshift >= isz)
            goto out;
    }
    /*
     * Fetch through the indirect blocks.
     */
    for (; ((j <= NIADDR) && (nb != 0)); j++) {
        ob = nb;
        /*
         * if there's a different block at this level then release
         * the old one and in with the new.
         */
        if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
            if (bp[j-1] != NULL)
                brelse(bp[j-1]);
            bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
            if (bp[j-1]->b_flags & B_ERROR) {
                error = EIO;
                goto out;
            }
        }
        bap = bp[j-1]->b_un.b_daddr;
        shft -= nindirshift;		/* sh /= nindir */
        i = (tbn >> shft) & nindiroffset;	/* (tbn / sh) % nindir */
        skip = 1LL << (nindirshift * (NIADDR - j));
        /*
         * Scan through the blocks in this array.
         */
        for (; i < nindir; i++, lbn += skip) {
            if (hole && (bap[i] == 0))
                goto out;
            if (!hole && (bap[i] != 0)) {
                if (j == NIADDR) {
                    /* we're at the lowest level */
                    goto out;
                }
                break;
            }
        }
        if (((u_offset_t)lbn << fs->fs_bshift) < isz)
            continue;
    }

out:
    for (i = 0; i < NIADDR; i++) {
        if (bp[i])
            brelse(bp[i]);
    }
    if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
        error = ENXIO;
    }

    *off = (u_offset_t)lbn << fs->fs_bshift;
    return (error);
}
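
/*
 * Typical use (illustrative): a SEEK_HOLE/SEEK_DATA style lookup seeds
 * *off with the starting byte offset and calls bmap_find(ip, B_TRUE,
 * &off) to locate the next hole (B_FALSE for data); on success *off
 * holds the matching offset, rounded down to a block boundary.
 */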
/*
 * Set a particular offset in the inode list to be a certain block.
 * User is responsible for calling TRANS* functions.
 */
int
bmap_set_bn(struct vnode *vp, u_offset_t off, daddr32_t bn)
{
    int shft;				/* we maintain sh = 1 << shft */
    daddr_t ob, nb, tbn;
    int nindirshift, nindiroffset;
    struct inode *ip = VTOI(vp);
    ufsvfs_t *ufsvfsp;
    struct fs *fs;
    struct buf *bp;
    daddr32_t *bap;
    daddr_t lbn;
    int i, j, err;
    ufsvfsp = ip->i_ufsvfs;
    fs = ufsvfsp->vfs_fs;
    lbn = (daddr_t)lblkno(fs, off);

    ASSERT(RW_LOCK_HELD(&ip->i_contents));
    /*
     * Take care of direct block assignment.
     */
    if (lbn < NDADDR) {
        ip->i_db[lbn] = bn;
        return (0);
    }
    nindirshift = ip->i_ufsvfs->vfs_nindirshift;
    nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
    /*
     * Determine how many levels of indirection.
     */
    shft = 0;				/* sh = 1 */
    tbn = lbn - NDADDR;
    for (j = NIADDR; j > 0; j--) {
        longlong_t sh;

        shft += nindirshift;		/* sh *= nindir */
        sh = 1LL << shft;
        if (tbn < sh)
            break;
        tbn -= sh;
    }
    /*
     * Fetch the first indirect block.
     */
    nb = ip->i_ib[NIADDR - j];
    if (nb == 0) {
        err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
        return (err);
    }
    /*
     * Fetch through the indirect blocks.
     */
    for (; j <= NIADDR; j++) {
        ob = nb;
        bp = UFS_BREAD(ufsvfsp,
            ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
        if (bp->b_flags & B_ERROR) {
            err = geterror(bp);
            brelse(bp);
            return (err);
        }
        bap = bp->b_un.b_daddr;

        ASSERT(!ufs_indir_badblock(ip, bap));

        shft -= nindirshift;		/* sh /= nindir */
        i = (tbn >> shft) & nindiroffset;	/* (tbn / sh) % nindir */
        nb = bap[i];
        if (nb == 0) {
            err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
            return (err);
        }