4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
39 #include <sys/types.h>
40 #include <sys/t_lock.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/signal.h>
45 #include <sys/vnode.h>
50 #include <sys/fs/ufs_inode.h>
51 #include <sys/fs/ufs_fs.h>
52 #include <sys/fs/ufs_quota.h>
53 #include <sys/fs/ufs_trans.h>
54 #include <sys/fs/ufs_bio.h>
56 #include <sys/errno.h>
57 #include <sys/sysmacros.h>
59 #include <sys/debug.h>
61 #include <sys/cmn_err.h>
64 * This structure is used to track blocks as we allocate them, so that
65 * we can free them if we encounter an error during allocation. We
66 * keep track of five pieces of information for each allocated block:
67 * - The number of the newly allocated block
68 * - The size of the block (lets us deal with fragments if we want)
69 * - The number of the block containing a pointer to it; or whether
70 * the pointer is in the inode
71 * - The offset within the block (or inode) containing a pointer to it.
72 * - A flag indicating the usage of the block. (Logging needs to know
73 * this to avoid overwriting a data block if it was previously used
78 ufs_no_owner
, /* Owner has not yet been updated */
79 ufs_inode_direct
, /* Listed in inode's direct block table */
80 ufs_inode_indirect
, /* Listed in inode's indirect block table */
81 ufs_indirect_block
/* Listed in an indirect block */
84 struct ufs_allocated_block
{
85 daddr_t this_block
; /* Number of this block */
86 off_t block_size
; /* Size of this block, in bytes */
87 enum ufs_owner_type owner
; /* Who points to this block? */
88 daddr_t owner_block
; /* Number of the owning block */
89 uint_t owner_offset
; /* Offset within that block or inode */
90 int usage_flags
; /* Usage flags, as expected by free() */
94 static int findextent(struct fs
*fs
, daddr32_t
*sbp
, int n
, int *lenp
,
97 static void ufs_undo_allocation(inode_t
*ip
, int block_count
,
98 struct ufs_allocated_block table
[], int inode_sector_adjust
);
101 * Find the extent and the matching block number.
104 * boff indicates that we want a page in the middle
105 * min expression is supposed to make sure no extra page[s] after EOF
107 * we assume that a page is a multiple of bsize, i.e.,
110 * We always return a length that is suitable for a disk transfer.
112 #define DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
113 register daddr32_t *dp = (tblp); \
114 register int _chkfrag = chkfrag; /* for lint. sigh */ \
121 len = findextent(fs, dp, (int)(n), lenp, maxtrans) << \
124 register uoff_t tmp; \
126 tmp = fragroundup((fs), size) - \
127 (((uoff_t)lbn) << fs->fs_bshift); \
128 len = (int)MIN(tmp, len); \
134 *(bnp) = fsbtodb(fs, *dp) + btodb(boff); \
141 * The maximum supported file size is actually somewhat less than 1
142 * terabyte. This is because the total number of blocks used for the
143 * file and its metadata must fit into the ic_blocks field of the
144 * inode, which is a signed 32-bit quantity. The metadata allocated
145 * for a file (that is, the single, double, and triple indirect blocks
146 * used to reference the file blocks) is actually quite small,
147 * but just to make sure, we check for overflow in the
148 * ic_blocks field for all files whose total block count is
149 * within 1 GB of a terabyte. VERYLARGEFILESIZE below is the number of
150 * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
151 * in a gigabyte (2^21). We only check for overflow in the ic_blocks
152 * field if the number of blocks currently allocated to the file is
153 * greater than VERYLARGEFILESIZE.
155 * Note that file "size" is not the same as file "length". A
156 * file's "size" is the number of blocks allocated to it. A file's
157 * "length" is the maximum offset in the file. A UFS FILE can have a
158 * length of a terabyte, but the size is limited to somewhat less than
159 * a terabyte, as described above.
161 #define VERYLARGEFILESIZE 0x7FE00000
164 * bmap{read,write} define the structure of file system storage by mapping
165 * a logical offset in a file to a physical block number on the device.
166 * It should be called with a locked inode when allocation is to be
167 * done (bmap_write). Note this strangeness: bmap_write is always called from
168 * getpage(), not putpage(), since getpage() is where all the allocation
171 * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
173 * NOTICE: the block number returned is the disk block number, not the
174 * file system block number. All the worries about block offsets and
175 * page/block sizes are hidden inside of bmap. Well, not quite,
176 * unfortunately. It's impossible to find one place to hide all this
177 * mess. There are 3 cases:
180 * In this case, the {get,put}page routines will attempt to align to
181 * a file system block boundary (XXX - maybe this is a mistake?). Since
182 * the kluster routines may be out of memory, we don't always get all
183 * the pages we wanted. If we called bmap first, to find out how much
184 * to kluster, we handed in the block aligned offset. If we didn't get
185 * all the pages, we have to chop off the amount we didn't get from the
186 * amount handed back by bmap.
189 * Life is quite pleasant here, no extra work needed, mainly because we
190 * (probably?) won't kluster backwards, just forwards.
193 * This one has a different set of problems, specifically, we may have to
194 * do N reads to fill one page. Let us hope that Sun will stay with small
197 * Returns 0 on success, or a non-zero errno if an error occurs.
200 * LMXXX - add a bmap cache. This could be a couple of extents in the
201 * inode. Two is nice for PAGESIZE > bsize.
205 bmap_read(struct inode
*ip
, uoff_t off
, daddr_t
*bnp
, int *lenp
)
208 ufsvfs_t
*ufsvfsp
= ip
->i_ufsvfs
;
209 struct fs
*fs
= ufsvfsp
->vfs_fs
;
212 int shft
; /* we maintain sh = 1 << shft */
215 int nindirshift
, nindiroffset
;
217 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
218 lbn
= (daddr_t
)lblkno(fs
, off
);
219 boff
= (int)blkoff(fs
, off
);
224 * The first NDADDR blocks are direct blocks.
227 DOEXTENT(fs
, lbn
, boff
, bnp
, lenp
,
228 ip
->i_size
, &ip
->i_db
[lbn
], NDADDR
- lbn
, 1,
229 ufsvfsp
->vfs_iotransz
);
233 nindirshift
= ufsvfsp
->vfs_nindirshift
;
234 nindiroffset
= ufsvfsp
->vfs_nindiroffset
;
236 * Determine how many levels of indirection.
238 shft
= 0; /* sh = 1 */
240 for (j
= NIADDR
; j
> 0; j
--) {
243 shft
+= nindirshift
; /* sh *= nindir */
253 * Fetch the first indirect block.
255 nb
= ip
->i_ib
[NIADDR
- j
];
262 * Fetch through the indirect blocks.
264 for (; j
<= NIADDR
; j
++) {
266 bp
= UFS_BREAD(ufsvfsp
,
267 ip
->i_dev
, fsbtodb(fs
, ob
), fs
->fs_bsize
);
268 if (bp
->b_flags
& B_ERROR
) {
272 bap
= bp
->b_un
.b_daddr
;
274 ASSERT(!ufs_indir_badblock(ip
, bap
));
276 shft
-= nindirshift
; /* sh / nindir */
277 i
= (tbn
>> shft
) & nindiroffset
; /* (tbn / sh) % nindir */
287 DOEXTENT(fs
, lbn
, boff
, bnp
, lenp
, ip
->i_size
, &bap
[i
],
288 MIN(NINDIR(fs
) - i
, (daddr_t
)lblkno(fs
, ip
->i_size
- 1) - lbn
+ 1),
289 0, ufsvfsp
->vfs_iotransz
);
295 * See bmap_read for general notes.
297 * The block must be at least size bytes and will be extended or
298 * allocated as needed. If alloc_type is of type BI_ALLOC_ONLY, then bmap
299 * will not create any in-core pages that correspond to the new disk allocation.
300 * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
301 * and security is maintained b/c upon reading a negative block number pages
302 * are zeroed. For all other allocation types (BI_NORMAL) the in-core pages will
303 * be created and initialized as needed.
305 * Returns 0 on success, or a non-zero errno if an error occurs.
308 bmap_write(struct inode
*ip
, uoff_t off
, int size
,
309 enum bi_type alloc_type
, daddr_t
*allocblk
, struct cred
*cr
)
316 int shft
; /* we maintain sh = 1 << shft */
317 daddr_t ob
, nb
, pref
, lbn
, llbn
, tbn
;
319 struct vnode
*vp
= ITOV(ip
);
320 long bsize
= VBSIZE(vp
);
322 int issync
, metaflag
, isdirquota
;
328 struct ufsvfs
*ufsvfsp
;
329 int added_sectors
; /* sectors added to this inode */
330 int alloced_blocks
; /* fs blocks newly allocated */
331 struct ufs_allocated_block undo_table
[NIADDR
+1];
332 int verylargefile
= 0;
334 ASSERT(RW_WRITE_HELD(&ip
->i_contents
));
339 ufsvfsp
= ip
->i_ufsvfs
;
340 fs
= ufsvfsp
->vfs_bufp
->b_un
.b_fs
;
341 lbn
= (daddr_t
)lblkno(fs
, off
);
344 if (ip
->i_blocks
>= VERYLARGEFILESIZE
)
346 llbn
= (daddr_t
)((ip
->i_size
) ? lblkno(fs
, ip
->i_size
- 1) : 0);
347 metaflag
= isdirquota
= 0;
348 if (((ip
->i_mode
& IFMT
) == IFDIR
) ||
349 ((ip
->i_mode
& IFMT
) == IFATTRDIR
))
350 isdirquota
= metaflag
= I_DIR
;
351 else if ((ip
->i_mode
& IFMT
) == IFSHAD
)
353 else if (ip
->i_ufsvfs
->vfs_qinod
== ip
)
354 isdirquota
= metaflag
= I_QUOTA
;
356 issync
= ((ip
->i_flag
& ISYNC
) != 0);
358 if (isdirquota
|| issync
) {
359 alloc_type
= BI_NORMAL
; /* make sure */
363 * If the next write will extend the file into a new block,
364 * and the file is currently composed of a fragment
365 * this fragment has to be extended to be a full block.
367 if (llbn
< NDADDR
&& llbn
< lbn
&& (ob
= ip
->i_db
[llbn
]) != 0) {
368 osize
= blksize(fs
, ip
, llbn
);
369 if (osize
< bsize
&& osize
> 0) {
371 * Check to see if doing this will make the file too
372 * big. Only check if we are dealing with a very
375 if (verylargefile
== 1) {
376 if (((unsigned)ip
->i_blocks
+
377 btodb(bsize
- osize
)) > INT_MAX
) {
382 * Make sure we have all needed pages setup correctly.
384 * We pass S_OTHER to fbread here because we want
385 * an exclusive lock on the page in question
386 * (see ufs_getpage). I/O to the old block location
387 * may still be in progress and we are about to free
388 * the old block. We don't want anyone else to get
389 * a hold of the old block once we free it until
390 * the I/O is complete.
393 fbread(ITOV(ip
), ((offset_t
)llbn
<< fs
->fs_bshift
),
394 (uint_t
)bsize
, S_OTHER
, &fbp
);
397 pref
= blkpref(ip
, llbn
, (int)llbn
, &ip
->i_db
[0]);
398 err
= realloccg(ip
, ob
, pref
, (int)osize
, (int)bsize
,
402 fbrelse(fbp
, S_OTHER
);
405 ASSERT(!ufs_badblock(ip
, nb
));
408 * Update the inode before releasing the
409 * lock on the page. If we released the page
410 * lock first, the data could be written to its
411 * old address and then destroyed.
413 TRANS_MATA_ALLOC(ufsvfsp
, ip
, nb
, bsize
, 0);
415 UFS_SET_ISIZE(((uoff_t
)(llbn
+ 1)) << fs
->fs_bshift
,
417 ip
->i_blocks
+= btodb(bsize
- osize
);
418 ASSERT((unsigned)ip
->i_blocks
<= INT_MAX
);
419 TRANS_INODE(ufsvfsp
, ip
);
420 ip
->i_flag
|= IUPD
| ICHG
| IATTCHG
;
422 /* Caller is responsible for updating i_seq */
424 * Don't check metaflag here, directories won't do this
428 (void) ufs_fbiwrite(fbp
, ip
, nb
, fs
->fs_fsize
);
431 fbrelse(fbp
, S_WRITE
);
435 (void) free(ip
, ob
, (off_t
)osize
, metaflag
);
441 * The first NDADDR blocks are direct blocks.
446 ip
->i_size
< ((uoff_t
)(lbn
+ 1)) << fs
->fs_bshift
) {
448 /* consider need to reallocate a frag */
449 osize
= fragroundup(fs
, blkoff(fs
, ip
->i_size
));
450 nsize
= fragroundup(fs
, size
);
454 * Check to see if doing this will make the
455 * file too big. Only check if we are dealing
456 * with a very large file.
458 if (verylargefile
== 1) {
459 if (((unsigned)ip
->i_blocks
+
460 btodb(nsize
- osize
)) > INT_MAX
) {
465 * need to re-allocate a block or frag
468 pref
= blkpref(ip
, lbn
, (int)lbn
,
470 err
= realloccg(ip
, ob
, pref
, (int)osize
,
471 (int)nsize
, &nb
, cr
);
476 ASSERT(!ufs_badblock(ip
, nb
));
480 * need to allocate a block or frag
484 ((uoff_t
)(lbn
+ 1)) << fs
->fs_bshift
)
485 nsize
= fragroundup(fs
, size
);
489 * Check to see if doing this will make the
490 * file too big. Only check if we are dealing
491 * with a very large file.
493 if (verylargefile
== 1) {
494 if (((unsigned)ip
->i_blocks
+
495 btodb(nsize
- osize
)) > INT_MAX
) {
499 pref
= blkpref(ip
, lbn
, (int)lbn
, &ip
->i_db
[0]);
500 err
= alloc(ip
, pref
, (int)nsize
, &nb
, cr
);
505 ASSERT(!ufs_badblock(ip
, nb
));
510 * Read old/create new zero pages
515 * mmap S_WRITE faults always enter here
518 * We zero it if its also BI_FALLOCATE, but
519 * only for direct blocks!
521 if (alloc_type
== BI_NORMAL
||
522 alloc_type
== BI_FALLOCATE
||
523 P2ROUNDUP_TYPED(size
,
524 PAGESIZE
, uoff_t
) < nsize
) {
525 /* fbzero doesn't cause a pagefault */
527 ((offset_t
)lbn
<< fs
->fs_bshift
),
528 (uint_t
)nsize
, &fbp
);
532 ((offset_t
)lbn
<< fs
->fs_bshift
),
533 (uint_t
)nsize
, S_OTHER
, &fbp
);
537 (off_t
)nsize
, metaflag
);
540 ob
+ numfrags(fs
, osize
),
541 (off_t
)(nsize
- osize
),
544 ASSERT(nsize
>= osize
);
546 -(long)btodb(nsize
- osize
),
547 0, cr
, (char **)NULL
, NULL
);
551 TRANS_MATA_ALLOC(ufsvfsp
, ip
, nb
, nsize
, 0);
553 ip
->i_blocks
+= btodb(nsize
- osize
);
554 ASSERT((unsigned)ip
->i_blocks
<= INT_MAX
);
555 TRANS_INODE(ufsvfsp
, ip
);
556 ip
->i_flag
|= IUPD
| ICHG
| IATTCHG
;
558 /* Caller is responsible for updating i_seq */
561 * Write directory and shadow blocks synchronously so
562 * that they never appear with garbage in them on the
566 if (isdirquota
&& (ip
->i_size
||
567 TRANS_ISTRANS(ufsvfsp
))) {
569 * XXX may not be necessary with harpy trans
572 (void) ufs_fbiwrite(fbp
, ip
, nb
, fs
->fs_fsize
);
574 fbrelse(fbp
, S_WRITE
);
578 (void) free(ip
, ob
, (off_t
)osize
, metaflag
);
584 added_sectors
= alloced_blocks
= 0; /* No blocks alloced yet */
587 * Determine how many levels of indirection.
589 nindirshift
= ip
->i_ufsvfs
->vfs_nindirshift
;
590 nindiroffset
= ip
->i_ufsvfs
->vfs_nindiroffset
;
592 shft
= 0; /* sh = 1 */
594 for (j
= NIADDR
; j
> 0; j
--) {
597 shft
+= nindirshift
; /* sh *= nindir */
608 * Fetch the first indirect block.
611 nb
= ip
->i_ib
[NIADDR
- j
];
614 * Check to see if doing this will make the
615 * file too big. Only check if we are dealing
616 * with a very large file.
618 if (verylargefile
== 1) {
619 if (((unsigned)ip
->i_blocks
+ btodb(bsize
))
625 * Need to allocate an indirect block.
627 pref
= blkpref(ip
, lbn
, 0, (daddr32_t
*)0);
628 err
= alloc(ip
, pref
, (int)bsize
, &nb
, cr
);
631 TRANS_MATA_ALLOC(ufsvfsp
, ip
, nb
, bsize
, 1);
632 ASSERT(!ufs_badblock(ip
, nb
));
635 * Keep track of this allocation so we can undo it if we
636 * get an error later.
639 ASSERT(alloced_blocks
<= NIADDR
);
641 undo_table
[alloced_blocks
].this_block
= nb
;
642 undo_table
[alloced_blocks
].block_size
= bsize
;
643 undo_table
[alloced_blocks
].owner
= ufs_no_owner
;
644 undo_table
[alloced_blocks
].usage_flags
= metaflag
| I_IBLK
;
649 * Write zero block synchronously so that
650 * indirect blocks never point at garbage.
652 bp
= UFS_GETBLK(ufsvfsp
, dev
, fsbtodb(fs
, nb
), bsize
);
655 /* XXX Maybe special-case this? */
656 TRANS_BUF(ufsvfsp
, 0, bsize
, bp
, DT_ABZERO
);
657 UFS_BWRITE2(ufsvfsp
, bp
);
658 if (bp
->b_flags
& B_ERROR
) {
661 ufs_undo_allocation(ip
, alloced_blocks
,
662 undo_table
, added_sectors
);
667 ip
->i_ib
[NIADDR
- j
] = nb
;
668 added_sectors
+= btodb(bsize
);
669 ip
->i_blocks
+= btodb(bsize
);
670 ASSERT((unsigned)ip
->i_blocks
<= INT_MAX
);
671 TRANS_INODE(ufsvfsp
, ip
);
672 ip
->i_flag
|= IUPD
| ICHG
| IATTCHG
;
673 /* Caller is responsible for updating i_seq */
676 * Update the 'undo table' now that we've linked this block
680 undo_table
[alloced_blocks
-1].owner
= ufs_inode_indirect
;
681 undo_table
[alloced_blocks
-1].owner_offset
= NIADDR
- j
;
684 * In the ISYNC case, wrip will notice that the block
685 * count on the inode has changed and will be sure to
686 * ufs_iupdat the inode at the end of wrip.
691 * Fetch through the indirect blocks.
693 for (; j
<= NIADDR
; j
++) {
695 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, fsbtodb(fs
, ob
), bsize
);
697 if (bp
->b_flags
& B_ERROR
) {
701 * Return any partial allocations.
703 * It is possible that we have not yet made any
704 * allocations at this point (if this is the first
705 * pass through the loop and we didn't have to
706 * allocate the first indirect block, above).
707 * In this case, alloced_blocks and added_sectors will
708 * be zero, and ufs_undo_allocation will do nothing.
710 ufs_undo_allocation(ip
, alloced_blocks
,
711 undo_table
, added_sectors
);
714 bap
= bp
->b_un
.b_daddr
;
715 shft
-= nindirshift
; /* sh /= nindir */
716 i
= (tbn
>> shft
) & nindiroffset
; /* (tbn / sh) % nindir */
721 * Check to see if doing this will make the
722 * file too big. Only check if we are dealing
723 * with a very large file.
725 if (verylargefile
== 1) {
726 if (((unsigned)ip
->i_blocks
+ btodb(bsize
))
729 ufs_undo_allocation(ip
, alloced_blocks
,
730 undo_table
, added_sectors
);
737 pref
= blkpref(ip
, lbn
, 0,
741 pref
= blkpref(ip
, lbn
, i
, &bap
[0]);
746 * release "bp" buf to avoid deadlock (re-bread later)
750 err
= alloc(ip
, pref
, (int)bsize
, &nb
, cr
);
753 * Return any partial allocations.
755 ufs_undo_allocation(ip
, alloced_blocks
,
756 undo_table
, added_sectors
);
760 ASSERT(!ufs_badblock(ip
, nb
));
761 ASSERT(alloced_blocks
<= NIADDR
);
766 undo_table
[alloced_blocks
].this_block
= nb
;
767 undo_table
[alloced_blocks
].block_size
= bsize
;
768 undo_table
[alloced_blocks
].owner
= ufs_no_owner
;
769 undo_table
[alloced_blocks
].usage_flags
= metaflag
|
770 ((j
< NIADDR
) ? I_IBLK
: 0);
775 TRANS_MATA_ALLOC(ufsvfsp
, ip
, nb
, bsize
, 1);
777 * Write synchronously so indirect
778 * blocks never point at garbage.
781 ufsvfsp
, dev
, fsbtodb(fs
, nb
), bsize
);
784 /* XXX Maybe special-case this? */
785 TRANS_BUF(ufsvfsp
, 0, bsize
, nbp
, DT_ABZERO
);
786 UFS_BWRITE2(ufsvfsp
, nbp
);
787 if (nbp
->b_flags
& B_ERROR
) {
794 ufs_undo_allocation(ip
,
796 undo_table
, added_sectors
);
800 } else if (alloc_type
== BI_NORMAL
||
801 P2ROUNDUP_TYPED(size
,
802 PAGESIZE
, uoff_t
) < bsize
) {
803 TRANS_MATA_ALLOC(ufsvfsp
, ip
, nb
, bsize
, 0);
805 ((offset_t
)lbn
<< fs
->fs_bshift
),
806 (uint_t
)bsize
, &fbp
);
809 * Cases which we need to do a synchronous
810 * write of the zeroed data pages:
812 * 1) If we are writing a directory then we
813 * want to write synchronously so blocks in
814 * directories never contain garbage.
816 * 2) If we are filling in a hole and the
817 * indirect block is going to be synchronously
818 * written back below we need to make sure
819 * that the zeroes are written here before
820 * the indirect block is updated so that if
821 * we crash before the real data is pushed
822 * we will not end up with random data in
823 * the middle of the file.
825 * 3) If the size of the request rounded up
826 * to the system page size is smaller than
827 * the file system block size, we want to
828 * write out all the pages now so that
829 * they are not aborted before they actually
830 * make it to ufs_putpage since the length
831 * of the inode will not include the pages.
834 if (isdirquota
|| (issync
&&
836 (void) ufs_fbiwrite(fbp
, ip
, nb
,
839 fbrelse(fbp
, S_WRITE
);
843 * re-acquire "bp" buf
845 bp
= UFS_BREAD(ufsvfsp
,
846 ip
->i_dev
, fsbtodb(fs
, ob
), bsize
);
847 if (bp
->b_flags
& B_ERROR
) {
851 * Return any partial allocations.
853 ufs_undo_allocation(ip
,
855 undo_table
, added_sectors
);
858 bap
= bp
->b_un
.b_daddr
;
862 * The magic explained: j will be equal to NIADDR
863 * when we are at the lowest level, this is where the
864 * array entries point directly to data blocks. Since
865 * we will be 'fallocate'ing we will go ahead and negate
868 if (alloc_type
== BI_FALLOCATE
&& j
== NIADDR
)
871 TRANS_BUF_ITEM_128(ufsvfsp
, bap
[i
], bap
, bp
, DT_AB
);
872 added_sectors
+= btodb(bsize
);
873 ip
->i_blocks
+= btodb(bsize
);
874 ASSERT((unsigned)ip
->i_blocks
<= INT_MAX
);
875 TRANS_INODE(ufsvfsp
, ip
);
876 ip
->i_flag
|= IUPD
| ICHG
| IATTCHG
;
878 /* Caller is responsible for updating i_seq */
880 undo_table
[alloced_blocks
-1].owner
=
882 undo_table
[alloced_blocks
-1].owner_block
= ob
;
883 undo_table
[alloced_blocks
-1].owner_offset
= i
;
886 UFS_BWRITE2(ufsvfsp
, bp
);
887 if (bp
->b_flags
& B_ERROR
) {
894 ufs_undo_allocation(ip
,
896 undo_table
, added_sectors
);
911 * Return 1 if inode has unmapped blocks (UFS holes) or if another thread
912 * is in the critical region of wrip().
915 bmap_has_holes(struct inode
*ip
)
917 struct fs
*fs
= ip
->i_fs
;
918 uint_t dblks
; /* # of data blocks */
919 uint_t mblks
; /* # of data + metadata blocks */
926 int fsbshift
= fs
->fs_bshift
;
927 int fsboffset
= (1 << fsbshift
) - 1;
930 * Check for writer in critical region, if found then we
931 * cannot trust the values of i_size and i_blocks
932 * simply return true.
934 if (ip
->i_writer
!= NULL
&& ip
->i_writer
!= curthread
) {
938 dblks
= (ip
->i_size
+ fsboffset
) >> fsbshift
;
939 mblks
= (ldbtob((uoff_t
)ip
->i_blocks
) + fsboffset
) >> fsbshift
;
942 * File has only direct blocks.
945 return (mblks
< dblks
);
946 nindirshift
= ip
->i_ufsvfs
->vfs_nindirshift
;
948 nindiroffset
= ip
->i_ufsvfs
->vfs_nindiroffset
;
949 nindirblks
= nindiroffset
+ 1;
954 * Determine how many levels of indirection.
956 for (j
= NIADDR
; j
> 0; j
--) {
959 shft
+= nindirshift
; /* sh *= nindir */
965 /* LINTED: warning: logical expression always true: op "||" */
968 if (j
== NIADDR
) /* single level indirection */
969 cnt
= NDADDR
+ 1 + dblks
;
970 else if (j
== NIADDR
-1) /* double indirection */
971 cnt
= NDADDR
+ 1 + nindirblks
+
972 1 + (dblks
+ nindiroffset
)/nindirblks
+ dblks
;
973 else if (j
== NIADDR
-2) { /* triple indirection */
974 n
= (dblks
+ nindiroffset
)/nindirblks
;
975 cnt
= NDADDR
+ 1 + nindirblks
+
976 1 + nindirblks
+ nindirblks
*nindirblks
+
977 1 + (n
+ nindiroffset
)/nindirblks
+ n
+ dblks
;
980 return (mblks
< cnt
);
984 * find some contig blocks starting at *sbp and going for min(n, max_contig)
985 * return the number of blocks (not frags) found.
986 * The array passed in must be at least [0..n-1].
989 findextent(struct fs
*fs
, daddr32_t
*sbp
, int n
, int *lenp
, int maxtransfer
)
991 register daddr_t bn
, nextbn
;
992 register daddr32_t
*bp
;
1004 n
= MIN(n
, lblkno(fs
, *lenp
));
1007 * If the user has set the value for maxcontig lower than
1008 * the drive transfer size, then assume they want this
1009 * to be the maximum value for the size of the data transfer.
1011 maxtransblk
= maxtransfer
>> DEV_BSHIFT
;
1012 if (fs
->fs_maxcontig
< maxtransblk
) {
1013 n
= MIN(n
, fs
->fs_maxcontig
);
1015 n
= MIN(n
, maxtransblk
);
1021 if (nextbn
== 0 || bn
+ diff
!= nextbn
)
1026 return ((int)(bp
- sbp
) + 1);
1030 * Free any blocks which had been successfully allocated. Always called
1031 * as a result of an error, so we don't bother returning an error code
1034 * If block_count and inode_sector_adjust are both zero, we'll do nothing.
1035 * Thus it is safe to call this as part of error handling, whether or not
1036 * any blocks have been allocated.
1038 * The ufs_inode_direct case is currently unused.
1042 ufs_undo_allocation(
1045 struct ufs_allocated_block table
[],
1046 int inode_sector_adjust
)
1050 int error_updating_pointers
;
1051 struct ufsvfs
*ufsvfsp
;
1054 error_updating_pointers
= 0;
1056 ufsvfsp
= ip
->i_ufsvfs
;
1059 * Update pointers on disk before freeing blocks. If we fail,
1060 * some blocks may remain busy; but they will be reclaimed by
1061 * an fsck. (This is better than letting a block wind up with
1062 * two owners if we successfully freed it but could not remove
1063 * the pointer to it.)
1066 for (i
= 0; i
< block_count
; i
++) {
1067 switch (table
[i
].owner
) {
1069 /* Nothing to do here, nobody points to us */
1071 case ufs_inode_direct
:
1072 ASSERT(table
[i
].owner_offset
< NDADDR
);
1073 ip
->i_db
[table
[i
].owner_offset
] = 0;
1076 case ufs_inode_indirect
:
1077 ASSERT(table
[i
].owner_offset
< NIADDR
);
1078 ip
->i_ib
[table
[i
].owner_offset
] = 0;
1081 case ufs_indirect_block
: {
1083 daddr32_t
*block_data
;
1085 /* Read/modify/log/write. */
1087 ASSERT(table
[i
].owner_offset
<
1088 (VBSIZE(ITOV(ip
)) / sizeof (daddr32_t
)));
1090 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
,
1091 fsbtodb(ufsvfsp
->vfs_fs
, table
[i
].owner_block
),
1094 if (bp
->b_flags
& B_ERROR
) {
1095 /* Couldn't read this block; give up. */
1096 error_updating_pointers
= 1;
1098 break; /* out of SWITCH */
1101 block_data
= bp
->b_un
.b_daddr
;
1102 block_data
[table
[i
].owner_offset
] = 0;
1104 /* Write a log entry which includes the zero. */
1105 /* It might be possible to optimize this by using */
1106 /* TRANS_BUF directly and zeroing only the four */
1107 /* bytes involved, but an attempt to do that led */
1108 /* to panics in the logging code. The attempt was */
1109 /* TRANS_BUF(ufsvfsp, */
1110 /* table[i].owner_offset * sizeof (daddr32_t), */
1111 /* sizeof (daddr32_t), */
1115 TRANS_BUF_ITEM_128(ufsvfsp
,
1116 block_data
[table
[i
].owner_offset
],
1117 block_data
, bp
, DT_AB
);
1119 /* Now we can write the buffer itself. */
1121 UFS_BWRITE2(ufsvfsp
, bp
);
1123 if (bp
->b_flags
& B_ERROR
) {
1124 error_updating_pointers
= 1;
1131 (void) ufs_fault(ITOV(ip
),
1132 "ufs_undo_allocation failure\n");
1138 * If the inode changed, or if we need to update its block count,
1139 * then do that now. We update the inode synchronously on disk
1140 * to ensure that it won't transiently point at a block we've
1141 * freed (only necessary if we're not logging).
1143 * NOTE: Currently ufs_iupdat() does not check for errors. When
1144 * it is fixed, we should verify that we successfully updated the
1145 * inode before freeing blocks below.
1148 if (inode_changed
|| (inode_sector_adjust
!= 0)) {
1149 ip
->i_blocks
-= inode_sector_adjust
;
1150 ASSERT((unsigned)ip
->i_blocks
<= INT_MAX
);
1151 TRANS_INODE(ufsvfsp
, ip
);
1152 ip
->i_flag
|= IUPD
| ICHG
| IATTCHG
;
1154 if (!TRANS_ISTRANS(ufsvfsp
))
1155 ufs_iupdat(ip
, I_SYNC
);
1159 * Now we go through and actually free the blocks, but only if we
1160 * successfully removed the pointers to them.
1163 if (!error_updating_pointers
) {
1164 for (i
= 0; i
< block_count
; i
++) {
1165 free(ip
, table
[i
].this_block
, table
[i
].block_size
,
1166 table
[i
].usage_flags
);
1172 * Find the next hole or data block in file starting at *off
1173 * Return found offset in *off, which can be less than the
1174 * starting offset if not block aligned.
1175 * This code is based on bmap_read().
1176 * Errors: ENXIO for end of file
1177 * EIO for block read error.
1180 bmap_find(struct inode
*ip
, boolean_t hole
, uoff_t
*off
)
1182 ufsvfs_t
*ufsvfsp
= ip
->i_ufsvfs
;
1183 struct fs
*fs
= ufsvfsp
->vfs_fs
;
1186 int shft
; /* we maintain sh = 1 << shft */
1187 int nindirshift
, nindiroffset
;
1188 daddr_t ob
, nb
, tbn
, lbn
, skip
;
1190 uoff_t isz
= (offset_t
)ip
->i_size
;
1191 int32_t bs
= fs
->fs_bsize
; /* file system block size */
1192 int32_t nindir
= fs
->fs_nindir
;
1195 daddr_t limits
[NIADDR
];
1198 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
1199 lbn
= (daddr_t
)lblkno(fs
, *off
);
1202 for (i
= 0; i
< NIADDR
; i
++)
1206 * The first NDADDR blocks are direct blocks.
1209 for (; lbn
< NDADDR
; lbn
++) {
1210 if ((hole
&& (ip
->i_db
[lbn
] == 0)) ||
1211 (!hole
&& (ip
->i_db
[lbn
] != 0))) {
1215 if ((uoff_t
)lbn
<< fs
->fs_bshift
>= isz
)
1219 nindir
= fs
->fs_nindir
;
1220 nindirshift
= ufsvfsp
->vfs_nindirshift
;
1221 nindiroffset
= ufsvfsp
->vfs_nindiroffset
;
1224 /* Set up limits array */
1225 for (limits
[0] = NDADDR
, j
= 1; j
< NIADDR
; j
++)
1226 limits
[j
] = limits
[j
-1] + (1ULL << (nindirshift
* j
));
1230 * Determine how many levels of indirection.
1232 shft
= 0; /* sh = 1 */
1234 for (j
= NIADDR
; j
> 0; j
--) {
1237 shft
+= nindirshift
; /* sh *= nindir */
1244 /* must have passed end of file */
1245 ASSERT(((uoff_t
)lbn
<< fs
->fs_bshift
) >= isz
);
1250 * Fetch the first indirect block.
1252 nb
= ip
->i_ib
[NIADDR
- j
];
1255 lbn
= limits
[NIADDR
- j
];
1258 lbn
= limits
[NIADDR
- j
+ 1];
1259 if ((uoff_t
)lbn
<< fs
->fs_bshift
>= isz
)
1266 * Fetch through the indirect blocks.
1268 for (; ((j
<= NIADDR
) && (nb
!= 0)); j
++) {
1271 * if there's a different block at this level then release
1272 * the old one and in with the new.
1274 if ((bp
[j
-1] == NULL
) || bp
[j
-1]->b_blkno
!= fsbtodb(fs
, ob
)) {
1275 if (bp
[j
-1] != NULL
)
1277 bp
[j
-1] = UFS_BREAD(ufsvfsp
, dev
, fsbtodb(fs
, ob
), bs
);
1278 if (bp
[j
-1]->b_flags
& B_ERROR
) {
1283 bap
= bp
[j
-1]->b_un
.b_daddr
;
1285 shft
-= nindirshift
; /* sh / nindir */
1286 i
= (tbn
>> shft
) & nindiroffset
; /* (tbn / sh) % nindir */
1288 skip
= 1LL << (nindirshift
* (NIADDR
- j
));
1292 * Scan through the blocks in this array.
1294 for (; i
< nindir
; i
++, lbn
+= skip
) {
1295 if (hole
&& (bap
[i
] == 0))
1297 if (!hole
&& (bap
[i
] != 0)) {
1299 /* we're at the lowest level */
1306 if (((uoff_t
)lbn
<< fs
->fs_bshift
) < isz
)
1309 for (i
= 0; i
< NIADDR
; i
++) {
1314 if (((uoff_t
)lbn
<< fs
->fs_bshift
) >= isz
) {
1318 *off
= (uoff_t
)lbn
<< fs
->fs_bshift
;
1325 * Set a particular offset in the inode list to be a certain block.
1326 * User is responsible for calling TRANS* functions
1329 bmap_set_bn(struct vnode
*vp
, uoff_t off
, daddr32_t bn
)
1337 int shft
; /* we maintain sh = 1 << shft */
1339 daddr_t ob
, nb
, tbn
;
1341 int nindirshift
, nindiroffset
;
1344 ufsvfsp
= ip
->i_ufsvfs
;
1345 fs
= ufsvfsp
->vfs_fs
;
1346 lbn
= (daddr_t
)lblkno(fs
, off
);
1348 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
1354 * Take care of direct block assignment
1361 nindirshift
= ip
->i_ufsvfs
->vfs_nindirshift
;
1362 nindiroffset
= ip
->i_ufsvfs
->vfs_nindiroffset
;
1364 * Determine how many levels of indirection.
1366 shft
= 0; /* sh = 1 */
1368 for (j
= NIADDR
; j
> 0; j
--) {
1371 shft
+= nindirshift
; /* sh *= nindir */
1381 * Fetch the first indirect block.
1383 nb
= ip
->i_ib
[NIADDR
- j
];
1385 err
= ufs_fault(ITOV(ip
), "ufs_set_bn: nb == UFS_HOLE");
1390 * Fetch through the indirect blocks.
1392 for (; j
<= NIADDR
; j
++) {
1394 bp
= UFS_BREAD(ufsvfsp
,
1395 ip
->i_dev
, fsbtodb(fs
, ob
), fs
->fs_bsize
);
1396 if (bp
->b_flags
& B_ERROR
) {
1401 bap
= bp
->b_un
.b_daddr
;
1403 ASSERT(!ufs_indir_badblock(ip
, bap
));
1405 shft
-= nindirshift
; /* sh / nindir */
1406 i
= (tbn
>> shft
) & nindiroffset
; /* (tbn / sh) % nindir */
1410 err
= ufs_fault(ITOV(ip
), "ufs_set_bn: nb == UFS_HOLE");