dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / kernel / fs / ufs / ufs_bmap.c
blob95550c9fd9ced3cc13f5337ccaeeea5ab794f64a
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
31 * All Rights Reserved
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
35 * contributors.
39 #include <sys/types.h>
40 #include <sys/t_lock.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/signal.h>
44 #include <sys/user.h>
45 #include <sys/vnode.h>
46 #include <sys/buf.h>
47 #include <sys/disp.h>
48 #include <sys/proc.h>
49 #include <sys/conf.h>
50 #include <sys/fs/ufs_inode.h>
51 #include <sys/fs/ufs_fs.h>
52 #include <sys/fs/ufs_quota.h>
53 #include <sys/fs/ufs_trans.h>
54 #include <sys/fs/ufs_bio.h>
55 #include <vm/seg.h>
56 #include <sys/errno.h>
57 #include <sys/sysmacros.h>
58 #include <sys/vfs.h>
59 #include <sys/debug.h>
60 #include <sys/kmem.h>
61 #include <sys/cmn_err.h>
/*
 * The structure below (struct ufs_allocated_block) is used to track blocks
 * as we allocate them, so that we can free them if we encounter an error
 * during allocation.  We keep track of five pieces of information for each
 * allocated block:
 *   - The number of the newly allocated block
 *   - The size of the block (lets us deal with fragments if we want)
 *   - The number of the block containing a pointer to it; or whether
 *     the pointer is in the inode
 *   - The offset within the block (or inode) containing a pointer to it.
 *   - A flag indicating the usage of the block.  (Logging needs to know
 *     this to avoid overwriting a data block if it was previously used
 *     for metadata.)
 */

/*
 * Identifies what kind of object holds the pointer to a newly allocated
 * block.  ufs_undo_allocation() switches on this value to decide which
 * pointer has to be cleared before the block itself is freed.
 */
enum ufs_owner_type {
	ufs_no_owner,		/* Owner has not yet been updated */
	ufs_inode_direct,	/* Listed in inode's direct block table */
	ufs_inode_indirect,	/* Listed in inode's indirect block table */
	ufs_indirect_block	/* Listed in an indirect block */
};
84 struct ufs_allocated_block {
85 daddr_t this_block; /* Number of this block */
86 off_t block_size; /* Size of this block, in bytes */
87 enum ufs_owner_type owner; /* Who points to this block? */
88 daddr_t owner_block; /* Number of the owning block */
89 uint_t owner_offset; /* Offset within that block or inode */
90 int usage_flags; /* Usage flags, as expected by free() */
94 static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
95 int maxtrans);
97 static void ufs_undo_allocation(inode_t *ip, int block_count,
98 struct ufs_allocated_block table[], int inode_sector_adjust);
/*
 * Find the extent and the matching block number.
 *
 * bsize > PAGESIZE
 *	boff indicates that we want a page in the middle
 *	min expression is supposed to make sure no extra page[s] after EOF
 * PAGESIZE >= bsize
 *	we assume that a page is a multiple of bsize, i.e.,
 *	boff always == 0
 *
 * We always return a length that is suitable for a disk transfer.
 *
 * Arguments:
 *	fs	 - file system (struct fs *)
 *	lbn	 - logical block number of the first block of the extent
 *	boff	 - byte offset within that block
 *	bnp	 - out: disk block number, or UFS_HOLE
 *	lenp	 - in/out: handed to findextent(); receives the extent
 *		   length in bytes when *bnp is not UFS_HOLE
 *	size	 - file size, only used when chkfrag is non-zero
 *	tblp	 - pointer to the block-table entry for lbn
 *	n	 - number of table entries available starting at tblp
 *	chkfrag	 - non-zero to clamp the length to the frag-rounded size
 *	maxtrans - maximum transfer size handed to findextent()
 *
 * Every argument is parenthesized in the expansion so that expression
 * arguments cannot re-associate with the surrounding operators.
 */
#define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
	register daddr32_t *dp = (tblp);				\
	register int _chkfrag = (chkfrag); /* for lint. sigh */		\
									\
	if (*dp == 0) {							\
		*(bnp) = UFS_HOLE;					\
	} else {							\
		register int len;					\
									\
		len = findextent((fs), dp, (int)(n), (lenp), (maxtrans)) << \
		    (fs)->fs_bshift;					\
		if (_chkfrag) {						\
			register uoff_t tmp;				\
									\
			tmp = fragroundup((fs), (size)) -		\
			    (((uoff_t)(lbn)) << (fs)->fs_bshift);	\
			len = (int)MIN(tmp, len);			\
		}							\
		len -= (boff);						\
		if (len <= 0) {						\
			*(bnp) = UFS_HOLE;				\
		} else {						\
			*(bnp) = fsbtodb((fs), *dp) + btodb(boff);	\
			*(lenp) = len;					\
		}							\
	}								\
}
/*
 * The maximum supported file size is actually somewhat less than 1
 * terabyte.  This is because the total number of blocks used for the
 * file and its metadata must fit into the ic_blocks field of the
 * inode, which is a signed 32-bit quantity.  The metadata allocated
 * for a file (that is, the single, double, and triple indirect blocks
 * used to reference the file blocks) is actually quite small,
 * but just to make sure, we check for overflow in the ic_blocks
 * field for all files whose total block count is
 * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
 * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
 * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
 * field if the number of blocks currently allocated to the file is
 * greater than VERYLARGEFILESIZE.
 *
 * Note that file "size" is not the same as file "length".  A
 * file's "size" is the number of blocks allocated to it.  A file's
 * "length" is the maximum offset in the file.  A UFS FILE can have a
 * length of a terabyte, but the size is limited to somewhat less than
 * a terabyte, as described above.
 */
#define	VERYLARGEFILESIZE	0x7FE00000
164 * bmap{read,write} define the structure of file system storage by mapping
165 * a logical offset in a file to a physical block number on the device.
166 * It should be called with a locked inode when allocation is to be
167 * done (bmap_write). Note this strangeness: bmap_write is always called from
168 * getpage(), not putpage(), since getpage() is where all the allocation
169 * is done.
171 * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
173 * NOTICE: the block number returned is the disk block number, not the
174 * file system block number. All the worries about block offsets and
175 * page/block sizes are hidden inside of bmap. Well, not quite,
176 * unfortunately. It's impossible to find one place to hide all this
177 * mess. There are 3 cases:
179 * PAGESIZE < bsize
180 * In this case, the {get,put}page routines will attempt to align to
181 * a file system block boundry (XXX - maybe this is a mistake?). Since
182 * the kluster routines may be out of memory, we don't always get all
183 * the pages we wanted. If we called bmap first, to find out how much
184 * to kluster, we handed in the block aligned offset. If we didn't get
185 * all the pages, we have to chop off the amount we didn't get from the
186 * amount handed back by bmap.
188 * PAGESIZE == bsize
189 * Life is quite pleasant here, no extra work needed, mainly because we
190 * (probably?) won't kluster backwards, just forwards.
192 * PAGESIZE > bsize
193 * This one has a different set of problems, specifically, we may have to
194 * do N reads to fill one page. Let us hope that Sun will stay with small
195 * pages.
197 * Returns 0 on success, or a non-zero errno if an error occurs.
199 * TODO
200 * LMXXX - add a bmap cache. This could be a couple of extents in the
201 * inode. Two is nice for PAGESIZE > bsize.
205 bmap_read(struct inode *ip, uoff_t off, daddr_t *bnp, int *lenp)
207 daddr_t lbn;
208 ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
209 struct fs *fs = ufsvfsp->vfs_fs;
210 struct buf *bp;
211 int i, j, boff;
212 int shft; /* we maintain sh = 1 << shft */
213 daddr_t ob, nb, tbn;
214 daddr32_t *bap;
215 int nindirshift, nindiroffset;
217 ASSERT(RW_LOCK_HELD(&ip->i_contents));
218 lbn = (daddr_t)lblkno(fs, off);
219 boff = (int)blkoff(fs, off);
220 if (lbn < 0)
221 return (EFBIG);
224 * The first NDADDR blocks are direct blocks.
226 if (lbn < NDADDR) {
227 DOEXTENT(fs, lbn, boff, bnp, lenp,
228 ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
229 ufsvfsp->vfs_iotransz);
230 return (0);
233 nindirshift = ufsvfsp->vfs_nindirshift;
234 nindiroffset = ufsvfsp->vfs_nindiroffset;
236 * Determine how many levels of indirection.
238 shft = 0; /* sh = 1 */
239 tbn = lbn - NDADDR;
240 for (j = NIADDR; j > 0; j--) {
241 longlong_t sh;
243 shft += nindirshift; /* sh *= nindir */
244 sh = 1LL << shft;
245 if (tbn < sh)
246 break;
247 tbn -= sh;
249 if (j == 0)
250 return (EFBIG);
253 * Fetch the first indirect block.
255 nb = ip->i_ib[NIADDR - j];
256 if (nb == 0) {
257 *bnp = UFS_HOLE;
258 return (0);
262 * Fetch through the indirect blocks.
264 for (; j <= NIADDR; j++) {
265 ob = nb;
266 bp = UFS_BREAD(ufsvfsp,
267 ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
268 if (bp->b_flags & B_ERROR) {
269 brelse(bp);
270 return (EIO);
272 bap = bp->b_un.b_daddr;
274 ASSERT(!ufs_indir_badblock(ip, bap));
276 shft -= nindirshift; /* sh / nindir */
277 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
278 nb = bap[i];
279 if (nb == 0) {
280 *bnp = UFS_HOLE;
281 brelse(bp);
282 return (0);
284 if (j != NIADDR)
285 brelse(bp);
287 DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
288 MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
289 0, ufsvfsp->vfs_iotransz);
290 brelse(bp);
291 return (0);
295 * See bmap_read for general notes.
297 * The block must be at least size bytes and will be extended or
298 * allocated as needed. If alloc_type is of type BI_ALLOC_ONLY, then bmap
299 * will not create any in-core pages that correspond to the new disk allocation.
300 * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
301 * and security is maintained b/c upon reading a negative block number pages
302 * are zeroed. For all other allocation types (BI_NORMAL) the in-core pages will
303 * be created and initialized as needed.
305 * Returns 0 on success, or a non-zero errno if an error occurs.
308 bmap_write(struct inode *ip, uoff_t off, int size,
309 enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
311 struct fs *fs;
312 struct buf *bp;
313 int i;
314 struct buf *nbp;
315 int j;
316 int shft; /* we maintain sh = 1 << shft */
317 daddr_t ob, nb, pref, lbn, llbn, tbn;
318 daddr32_t *bap;
319 struct vnode *vp = ITOV(ip);
320 long bsize = VBSIZE(vp);
321 long osize, nsize;
322 int issync, metaflag, isdirquota;
323 int err;
324 dev_t dev;
325 struct fbuf *fbp;
326 int nindirshift;
327 int nindiroffset;
328 struct ufsvfs *ufsvfsp;
329 int added_sectors; /* sectors added to this inode */
330 int alloced_blocks; /* fs blocks newly allocated */
331 struct ufs_allocated_block undo_table[NIADDR+1];
332 int verylargefile = 0;
334 ASSERT(RW_WRITE_HELD(&ip->i_contents));
336 if (allocblk)
337 *allocblk = 0;
339 ufsvfsp = ip->i_ufsvfs;
340 fs = ufsvfsp->vfs_bufp->b_un.b_fs;
341 lbn = (daddr_t)lblkno(fs, off);
342 if (lbn < 0)
343 return (EFBIG);
344 if (ip->i_blocks >= VERYLARGEFILESIZE)
345 verylargefile = 1;
346 llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
347 metaflag = isdirquota = 0;
348 if (((ip->i_mode & IFMT) == IFDIR) ||
349 ((ip->i_mode & IFMT) == IFATTRDIR))
350 isdirquota = metaflag = I_DIR;
351 else if ((ip->i_mode & IFMT) == IFSHAD)
352 metaflag = I_SHAD;
353 else if (ip->i_ufsvfs->vfs_qinod == ip)
354 isdirquota = metaflag = I_QUOTA;
356 issync = ((ip->i_flag & ISYNC) != 0);
358 if (isdirquota || issync) {
359 alloc_type = BI_NORMAL; /* make sure */
363 * If the next write will extend the file into a new block,
364 * and the file is currently composed of a fragment
365 * this fragment has to be extended to be a full block.
367 if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
368 osize = blksize(fs, ip, llbn);
369 if (osize < bsize && osize > 0) {
371 * Check to see if doing this will make the file too
372 * big. Only check if we are dealing with a very
373 * large file.
375 if (verylargefile == 1) {
376 if (((unsigned)ip->i_blocks +
377 btodb(bsize - osize)) > INT_MAX) {
378 return (EFBIG);
382 * Make sure we have all needed pages setup correctly.
384 * We pass S_OTHER to fbread here because we want
385 * an exclusive lock on the page in question
386 * (see ufs_getpage). I/O to the old block location
387 * may still be in progress and we are about to free
388 * the old block. We don't want anyone else to get
389 * a hold of the old block once we free it until
390 * the I/O is complete.
392 err =
393 fbread(ITOV(ip), ((offset_t)llbn << fs->fs_bshift),
394 (uint_t)bsize, S_OTHER, &fbp);
395 if (err)
396 return (err);
397 pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
398 err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
399 &nb, cr);
400 if (err) {
401 if (fbp)
402 fbrelse(fbp, S_OTHER);
403 return (err);
405 ASSERT(!ufs_badblock(ip, nb));
408 * Update the inode before releasing the
409 * lock on the page. If we released the page
410 * lock first, the data could be written to it's
411 * old address and then destroyed.
413 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
414 ip->i_db[llbn] = nb;
415 UFS_SET_ISIZE(((uoff_t)(llbn + 1)) << fs->fs_bshift,
416 ip);
417 ip->i_blocks += btodb(bsize - osize);
418 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
419 TRANS_INODE(ufsvfsp, ip);
420 ip->i_flag |= IUPD | ICHG | IATTCHG;
422 /* Caller is responsible for updating i_seq */
424 * Don't check metaflag here, directories won't do this
427 if (issync) {
428 (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
429 } else {
430 ASSERT(fbp);
431 fbrelse(fbp, S_WRITE);
434 if (nb != ob) {
435 (void) free(ip, ob, (off_t)osize, metaflag);
441 * The first NDADDR blocks are direct blocks.
443 if (lbn < NDADDR) {
444 nb = ip->i_db[lbn];
445 if (nb == 0 ||
446 ip->i_size < ((uoff_t)(lbn + 1)) << fs->fs_bshift) {
447 if (nb != 0) {
448 /* consider need to reallocate a frag */
449 osize = fragroundup(fs, blkoff(fs, ip->i_size));
450 nsize = fragroundup(fs, size);
451 if (nsize <= osize)
452 goto gotit;
454 * Check to see if doing this will make the
455 * file too big. Only check if we are dealing
456 * with a very large file.
458 if (verylargefile == 1) {
459 if (((unsigned)ip->i_blocks +
460 btodb(nsize - osize)) > INT_MAX) {
461 return (EFBIG);
465 * need to re-allocate a block or frag
467 ob = nb;
468 pref = blkpref(ip, lbn, (int)lbn,
469 &ip->i_db[0]);
470 err = realloccg(ip, ob, pref, (int)osize,
471 (int)nsize, &nb, cr);
472 if (err)
473 return (err);
474 if (allocblk)
475 *allocblk = nb;
476 ASSERT(!ufs_badblock(ip, nb));
478 } else {
480 * need to allocate a block or frag
482 osize = 0;
483 if (ip->i_size <
484 ((uoff_t)(lbn + 1)) << fs->fs_bshift)
485 nsize = fragroundup(fs, size);
486 else
487 nsize = bsize;
489 * Check to see if doing this will make the
490 * file too big. Only check if we are dealing
491 * with a very large file.
493 if (verylargefile == 1) {
494 if (((unsigned)ip->i_blocks +
495 btodb(nsize - osize)) > INT_MAX) {
496 return (EFBIG);
499 pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
500 err = alloc(ip, pref, (int)nsize, &nb, cr);
501 if (err)
502 return (err);
503 if (allocblk)
504 *allocblk = nb;
505 ASSERT(!ufs_badblock(ip, nb));
506 ob = nb;
510 * Read old/create new zero pages
512 fbp = NULL;
513 if (osize == 0) {
515 * mmap S_WRITE faults always enter here
518 * We zero it if its also BI_FALLOCATE, but
519 * only for direct blocks!
521 if (alloc_type == BI_NORMAL ||
522 alloc_type == BI_FALLOCATE ||
523 P2ROUNDUP_TYPED(size,
524 PAGESIZE, uoff_t) < nsize) {
525 /* fbzero doesn't cause a pagefault */
526 fbzero(ITOV(ip),
527 ((offset_t)lbn << fs->fs_bshift),
528 (uint_t)nsize, &fbp);
530 } else {
531 err = fbread(vp,
532 ((offset_t)lbn << fs->fs_bshift),
533 (uint_t)nsize, S_OTHER, &fbp);
534 if (err) {
535 if (nb != ob) {
536 (void) free(ip, nb,
537 (off_t)nsize, metaflag);
538 } else {
539 (void) free(ip,
540 ob + numfrags(fs, osize),
541 (off_t)(nsize - osize),
542 metaflag);
544 ASSERT(nsize >= osize);
545 (void) chkdq(ip,
546 -(long)btodb(nsize - osize),
547 0, cr, (char **)NULL, NULL);
548 return (err);
551 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
552 ip->i_db[lbn] = nb;
553 ip->i_blocks += btodb(nsize - osize);
554 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
555 TRANS_INODE(ufsvfsp, ip);
556 ip->i_flag |= IUPD | ICHG | IATTCHG;
558 /* Caller is responsible for updating i_seq */
561 * Write directory and shadow blocks synchronously so
562 * that they never appear with garbage in them on the
563 * disk.
566 if (isdirquota && (ip->i_size ||
567 TRANS_ISTRANS(ufsvfsp))) {
569 * XXX man not be necessary with harpy trans
570 * bug id 1130055
572 (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
573 } else if (fbp) {
574 fbrelse(fbp, S_WRITE);
577 if (nb != ob)
578 (void) free(ip, ob, (off_t)osize, metaflag);
580 gotit:
581 return (0);
584 added_sectors = alloced_blocks = 0; /* No blocks alloced yet */
587 * Determine how many levels of indirection.
589 nindirshift = ip->i_ufsvfs->vfs_nindirshift;
590 nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
591 pref = 0;
592 shft = 0; /* sh = 1 */
593 tbn = lbn - NDADDR;
594 for (j = NIADDR; j > 0; j--) {
595 longlong_t sh;
597 shft += nindirshift; /* sh *= nindir */
598 sh = 1LL << shft;
599 if (tbn < sh)
600 break;
601 tbn -= sh;
604 if (j == 0)
605 return (EFBIG);
608 * Fetch the first indirect block.
610 dev = ip->i_dev;
611 nb = ip->i_ib[NIADDR - j];
612 if (nb == 0) {
614 * Check to see if doing this will make the
615 * file too big. Only check if we are dealing
616 * with a very large file.
618 if (verylargefile == 1) {
619 if (((unsigned)ip->i_blocks + btodb(bsize))
620 > INT_MAX) {
621 return (EFBIG);
625 * Need to allocate an indirect block.
627 pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
628 err = alloc(ip, pref, (int)bsize, &nb, cr);
629 if (err)
630 return (err);
631 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
632 ASSERT(!ufs_badblock(ip, nb));
635 * Keep track of this allocation so we can undo it if we
636 * get an error later.
639 ASSERT(alloced_blocks <= NIADDR);
641 undo_table[alloced_blocks].this_block = nb;
642 undo_table[alloced_blocks].block_size = bsize;
643 undo_table[alloced_blocks].owner = ufs_no_owner;
644 undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;
646 alloced_blocks++;
649 * Write zero block synchronously so that
650 * indirect blocks never point at garbage.
652 bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);
654 clrbuf(bp);
655 /* XXX Maybe special-case this? */
656 TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
657 UFS_BWRITE2(ufsvfsp, bp);
658 if (bp->b_flags & B_ERROR) {
659 err = geterror(bp);
660 brelse(bp);
661 ufs_undo_allocation(ip, alloced_blocks,
662 undo_table, added_sectors);
663 return (err);
665 brelse(bp);
667 ip->i_ib[NIADDR - j] = nb;
668 added_sectors += btodb(bsize);
669 ip->i_blocks += btodb(bsize);
670 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
671 TRANS_INODE(ufsvfsp, ip);
672 ip->i_flag |= IUPD | ICHG | IATTCHG;
673 /* Caller is responsible for updating i_seq */
676 * Update the 'undo table' now that we've linked this block
677 * to an inode.
680 undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
681 undo_table[alloced_blocks-1].owner_offset = NIADDR - j;
684 * In the ISYNC case, wrip will notice that the block
685 * count on the inode has changed and will be sure to
686 * ufs_iupdat the inode at the end of wrip.
691 * Fetch through the indirect blocks.
693 for (; j <= NIADDR; j++) {
694 ob = nb;
695 bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);
697 if (bp->b_flags & B_ERROR) {
698 err = geterror(bp);
699 brelse(bp);
701 * Return any partial allocations.
703 * It is possible that we have not yet made any
704 * allocations at this point (if this is the first
705 * pass through the loop and we didn't have to
706 * allocate the first indirect block, above).
707 * In this case, alloced_blocks and added_sectors will
708 * be zero, and ufs_undo_allocation will do nothing.
710 ufs_undo_allocation(ip, alloced_blocks,
711 undo_table, added_sectors);
712 return (err);
714 bap = bp->b_un.b_daddr;
715 shft -= nindirshift; /* sh /= nindir */
716 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
717 nb = bap[i];
719 if (nb == 0) {
721 * Check to see if doing this will make the
722 * file too big. Only check if we are dealing
723 * with a very large file.
725 if (verylargefile == 1) {
726 if (((unsigned)ip->i_blocks + btodb(bsize))
727 > INT_MAX) {
728 brelse(bp);
729 ufs_undo_allocation(ip, alloced_blocks,
730 undo_table, added_sectors);
731 return (EFBIG);
734 if (pref == 0) {
735 if (j < NIADDR) {
736 /* Indirect block */
737 pref = blkpref(ip, lbn, 0,
738 (daddr32_t *)0);
739 } else {
740 /* Data block */
741 pref = blkpref(ip, lbn, i, &bap[0]);
746 * release "bp" buf to avoid deadlock (re-bread later)
748 brelse(bp);
750 err = alloc(ip, pref, (int)bsize, &nb, cr);
751 if (err) {
753 * Return any partial allocations.
755 ufs_undo_allocation(ip, alloced_blocks,
756 undo_table, added_sectors);
757 return (err);
760 ASSERT(!ufs_badblock(ip, nb));
761 ASSERT(alloced_blocks <= NIADDR);
763 if (allocblk)
764 *allocblk = nb;
766 undo_table[alloced_blocks].this_block = nb;
767 undo_table[alloced_blocks].block_size = bsize;
768 undo_table[alloced_blocks].owner = ufs_no_owner;
769 undo_table[alloced_blocks].usage_flags = metaflag |
770 ((j < NIADDR) ? I_IBLK : 0);
772 alloced_blocks++;
774 if (j < NIADDR) {
775 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
777 * Write synchronously so indirect
778 * blocks never point at garbage.
780 nbp = UFS_GETBLK(
781 ufsvfsp, dev, fsbtodb(fs, nb), bsize);
783 clrbuf(nbp);
784 /* XXX Maybe special-case this? */
785 TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
786 UFS_BWRITE2(ufsvfsp, nbp);
787 if (nbp->b_flags & B_ERROR) {
788 err = geterror(nbp);
789 brelse(nbp);
791 * Return any partial
792 * allocations.
794 ufs_undo_allocation(ip,
795 alloced_blocks,
796 undo_table, added_sectors);
797 return (err);
799 brelse(nbp);
800 } else if (alloc_type == BI_NORMAL ||
801 P2ROUNDUP_TYPED(size,
802 PAGESIZE, uoff_t) < bsize) {
803 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
804 fbzero(ITOV(ip),
805 ((offset_t)lbn << fs->fs_bshift),
806 (uint_t)bsize, &fbp);
809 * Cases which we need to do a synchronous
810 * write of the zeroed data pages:
812 * 1) If we are writing a directory then we
813 * want to write synchronously so blocks in
814 * directories never contain garbage.
816 * 2) If we are filling in a hole and the
817 * indirect block is going to be synchronously
818 * written back below we need to make sure
819 * that the zeroes are written here before
820 * the indirect block is updated so that if
821 * we crash before the real data is pushed
822 * we will not end up with random data is
823 * the middle of the file.
825 * 3) If the size of the request rounded up
826 * to the system page size is smaller than
827 * the file system block size, we want to
828 * write out all the pages now so that
829 * they are not aborted before they actually
830 * make it to ufs_putpage since the length
831 * of the inode will not include the pages.
834 if (isdirquota || (issync &&
835 lbn < llbn))
836 (void) ufs_fbiwrite(fbp, ip, nb,
837 fs->fs_fsize);
838 else
839 fbrelse(fbp, S_WRITE);
843 * re-acquire "bp" buf
845 bp = UFS_BREAD(ufsvfsp,
846 ip->i_dev, fsbtodb(fs, ob), bsize);
847 if (bp->b_flags & B_ERROR) {
848 err = geterror(bp);
849 brelse(bp);
851 * Return any partial allocations.
853 ufs_undo_allocation(ip,
854 alloced_blocks,
855 undo_table, added_sectors);
856 return (err);
858 bap = bp->b_un.b_daddr;
859 bap[i] = nb;
862 * The magic explained: j will be equal to NIADDR
863 * when we are at the lowest level, this is where the
864 * array entries point directly to data blocks. Since
865 * we will be 'fallocate'ing we will go ahead and negate
866 * the addresses.
868 if (alloc_type == BI_FALLOCATE && j == NIADDR)
869 bap[i] = -bap[i];
871 TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
872 added_sectors += btodb(bsize);
873 ip->i_blocks += btodb(bsize);
874 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
875 TRANS_INODE(ufsvfsp, ip);
876 ip->i_flag |= IUPD | ICHG | IATTCHG;
878 /* Caller is responsible for updating i_seq */
880 undo_table[alloced_blocks-1].owner =
881 ufs_indirect_block;
882 undo_table[alloced_blocks-1].owner_block = ob;
883 undo_table[alloced_blocks-1].owner_offset = i;
885 if (issync) {
886 UFS_BWRITE2(ufsvfsp, bp);
887 if (bp->b_flags & B_ERROR) {
888 err = geterror(bp);
889 brelse(bp);
891 * Return any partial
892 * allocations.
894 ufs_undo_allocation(ip,
895 alloced_blocks,
896 undo_table, added_sectors);
897 return (err);
899 brelse(bp);
900 } else {
901 bdrwrite(bp);
903 } else {
904 brelse(bp);
907 return (0);
911 * Return 1 if inode has unmapped blocks (UFS holes) or if another thread
912 * is in the critical region of wrip().
915 bmap_has_holes(struct inode *ip)
917 struct fs *fs = ip->i_fs;
918 uint_t dblks; /* # of data blocks */
919 uint_t mblks; /* # of data + metadata blocks */
920 int nindirshift;
921 int nindiroffset;
922 uint_t cnt;
923 int n, j, shft;
924 uint_t nindirblks;
926 int fsbshift = fs->fs_bshift;
927 int fsboffset = (1 << fsbshift) - 1;
930 * Check for writer in critical region, if found then we
931 * cannot trust the values of i_size and i_blocks
932 * simply return true.
934 if (ip->i_writer != NULL && ip->i_writer != curthread) {
935 return (1);
938 dblks = (ip->i_size + fsboffset) >> fsbshift;
939 mblks = (ldbtob((uoff_t)ip->i_blocks) + fsboffset) >> fsbshift;
942 * File has only direct blocks.
944 if (dblks <= NDADDR)
945 return (mblks < dblks);
946 nindirshift = ip->i_ufsvfs->vfs_nindirshift;
948 nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
949 nindirblks = nindiroffset + 1;
951 dblks -= NDADDR;
952 shft = 0;
954 * Determine how many levels of indirection.
956 for (j = NIADDR; j > 0; j--) {
957 longlong_t sh;
959 shft += nindirshift; /* sh *= nindir */
960 sh = 1LL << shft;
961 if (dblks <= sh)
962 break;
963 dblks -= sh;
965 /* LINTED: warning: logical expression always true: op "||" */
966 ASSERT(NIADDR <= 3);
967 ASSERT(j <= NIADDR);
968 if (j == NIADDR) /* single level indirection */
969 cnt = NDADDR + 1 + dblks;
970 else if (j == NIADDR-1) /* double indirection */
971 cnt = NDADDR + 1 + nindirblks +
972 1 + (dblks + nindiroffset)/nindirblks + dblks;
973 else if (j == NIADDR-2) { /* triple indirection */
974 n = (dblks + nindiroffset)/nindirblks;
975 cnt = NDADDR + 1 + nindirblks +
976 1 + nindirblks + nindirblks*nindirblks +
977 1 + (n + nindiroffset)/nindirblks + n + dblks;
980 return (mblks < cnt);
984 * find some contig blocks starting at *sbp and going for min(n, max_contig)
985 * return the number of blocks (not frags) found.
986 * The array passed in must be at least [0..n-1].
988 static int
989 findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
991 register daddr_t bn, nextbn;
992 register daddr32_t *bp;
993 register int diff;
994 int maxtransblk;
996 if (n <= 0)
997 return (0);
998 bn = *sbp;
999 if (bn == 0)
1000 return (0);
1002 diff = fs->fs_frag;
1003 if (*lenp) {
1004 n = MIN(n, lblkno(fs, *lenp));
1005 } else {
1007 * If the user has set the value for maxcontig lower than
1008 * the drive transfer size, then assume they want this
1009 * to be the maximum value for the size of the data transfer.
1011 maxtransblk = maxtransfer >> DEV_BSHIFT;
1012 if (fs->fs_maxcontig < maxtransblk) {
1013 n = MIN(n, fs->fs_maxcontig);
1014 } else {
1015 n = MIN(n, maxtransblk);
1018 bp = sbp;
1019 while (--n > 0) {
1020 nextbn = *(bp + 1);
1021 if (nextbn == 0 || bn + diff != nextbn)
1022 break;
1023 bn = nextbn;
1024 bp++;
1026 return ((int)(bp - sbp) + 1);
1030 * Free any blocks which had been successfully allocated. Always called
1031 * as a result of an error, so we don't bother returning an error code
1032 * from here.
1034 * If block_count and inode_sector_adjust are both zero, we'll do nothing.
1035 * Thus it is safe to call this as part of error handling, whether or not
1036 * any blocks have been allocated.
1038 * The ufs_inode_direct case is currently unused.
1041 static void
1042 ufs_undo_allocation(
1043 inode_t *ip,
1044 int block_count,
1045 struct ufs_allocated_block table[],
1046 int inode_sector_adjust)
1048 int i;
1049 int inode_changed;
1050 int error_updating_pointers;
1051 struct ufsvfs *ufsvfsp;
1053 inode_changed = 0;
1054 error_updating_pointers = 0;
1056 ufsvfsp = ip->i_ufsvfs;
1059 * Update pointers on disk before freeing blocks. If we fail,
1060 * some blocks may remain busy; but they will be reclaimed by
1061 * an fsck. (This is better than letting a block wind up with
1062 * two owners if we successfully freed it but could not remove
1063 * the pointer to it.)
1066 for (i = 0; i < block_count; i++) {
1067 switch (table[i].owner) {
1068 case ufs_no_owner:
1069 /* Nothing to do here, nobody points to us */
1070 break;
1071 case ufs_inode_direct:
1072 ASSERT(table[i].owner_offset < NDADDR);
1073 ip->i_db[table[i].owner_offset] = 0;
1074 inode_changed = 1;
1075 break;
1076 case ufs_inode_indirect:
1077 ASSERT(table[i].owner_offset < NIADDR);
1078 ip->i_ib[table[i].owner_offset] = 0;
1079 inode_changed = 1;
1080 break;
1081 case ufs_indirect_block: {
1082 buf_t *bp;
1083 daddr32_t *block_data;
1085 /* Read/modify/log/write. */
1087 ASSERT(table[i].owner_offset <
1088 (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
1090 bp = UFS_BREAD(ufsvfsp, ip->i_dev,
1091 fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
1092 VBSIZE(ITOV(ip)));
1094 if (bp->b_flags & B_ERROR) {
1095 /* Couldn't read this block; give up. */
1096 error_updating_pointers = 1;
1097 brelse(bp);
1098 break; /* out of SWITCH */
1101 block_data = bp->b_un.b_daddr;
1102 block_data[table[i].owner_offset] = 0;
1104 /* Write a log entry which includes the zero. */
1105 /* It might be possible to optimize this by using */
1106 /* TRANS_BUF directly and zeroing only the four */
1107 /* bytes involved, but an attempt to do that led */
1108 /* to panics in the logging code. The attempt was */
1109 /* TRANS_BUF(ufsvfsp, */
1110 /* table[i].owner_offset * sizeof (daddr32_t), */
1111 /* sizeof (daddr32_t), */
1112 /* bp, */
1113 /* DT_ABZERO); */
1115 TRANS_BUF_ITEM_128(ufsvfsp,
1116 block_data[table[i].owner_offset],
1117 block_data, bp, DT_AB);
1119 /* Now we can write the buffer itself. */
1121 UFS_BWRITE2(ufsvfsp, bp);
1123 if (bp->b_flags & B_ERROR) {
1124 error_updating_pointers = 1;
1127 brelse(bp);
1128 break;
1130 default:
1131 (void) ufs_fault(ITOV(ip),
1132 "ufs_undo_allocation failure\n");
1133 break;
1138 * If the inode changed, or if we need to update its block count,
1139 * then do that now. We update the inode synchronously on disk
1140 * to ensure that it won't transiently point at a block we've
1141 * freed (only necessary if we're not logging).
1143 * NOTE: Currently ufs_iupdat() does not check for errors. When
1144 * it is fixed, we should verify that we successfully updated the
1145 * inode before freeing blocks below.
1148 if (inode_changed || (inode_sector_adjust != 0)) {
1149 ip->i_blocks -= inode_sector_adjust;
1150 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
1151 TRANS_INODE(ufsvfsp, ip);
1152 ip->i_flag |= IUPD | ICHG | IATTCHG;
1153 ip->i_seq++;
1154 if (!TRANS_ISTRANS(ufsvfsp))
1155 ufs_iupdat(ip, I_SYNC);
1159 * Now we go through and actually free the blocks, but only if we
1160 * successfully removed the pointers to them.
1163 if (!error_updating_pointers) {
1164 for (i = 0; i < block_count; i++) {
1165 free(ip, table[i].this_block, table[i].block_size,
1166 table[i].usage_flags);
1172 * Find the next hole or data block in file starting at *off
1173 * Return found offset in *off, which can be less than the
1174 * starting offset if not block aligned.
1175 * This code is based on bmap_read().
1176 * Errors: ENXIO for end of file
1177 * EIO for block read error.
1180 bmap_find(struct inode *ip, boolean_t hole, uoff_t *off)
1182 ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
1183 struct fs *fs = ufsvfsp->vfs_fs;
1184 buf_t *bp[NIADDR];
1185 int i, j;
1186 int shft; /* we maintain sh = 1 << shft */
1187 int nindirshift, nindiroffset;
1188 daddr_t ob, nb, tbn, lbn, skip;
1189 daddr32_t *bap;
1190 uoff_t isz = (offset_t)ip->i_size;
1191 int32_t bs = fs->fs_bsize; /* file system block size */
1192 int32_t nindir = fs->fs_nindir;
1193 dev_t dev;
1194 int error = 0;
1195 daddr_t limits[NIADDR];
1197 ASSERT(*off < isz);
1198 ASSERT(RW_LOCK_HELD(&ip->i_contents));
1199 lbn = (daddr_t)lblkno(fs, *off);
1200 ASSERT(lbn >= 0);
1202 for (i = 0; i < NIADDR; i++)
1203 bp[i] = NULL;
1206 * The first NDADDR blocks are direct blocks.
1208 if (lbn < NDADDR) {
1209 for (; lbn < NDADDR; lbn++) {
1210 if ((hole && (ip->i_db[lbn] == 0)) ||
1211 (!hole && (ip->i_db[lbn] != 0))) {
1212 goto out;
1215 if ((uoff_t)lbn << fs->fs_bshift >= isz)
1216 goto out;
1219 nindir = fs->fs_nindir;
1220 nindirshift = ufsvfsp->vfs_nindirshift;
1221 nindiroffset = ufsvfsp->vfs_nindiroffset;
1222 dev = ip->i_dev;
1224 /* Set up limits array */
1225 for (limits[0] = NDADDR, j = 1; j < NIADDR; j++)
1226 limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
1228 loop:
1230 * Determine how many levels of indirection.
1232 shft = 0; /* sh = 1 */
1233 tbn = lbn - NDADDR;
1234 for (j = NIADDR; j > 0; j--) {
1235 longlong_t sh;
1237 shft += nindirshift; /* sh *= nindir */
1238 sh = 1LL << shft;
1239 if (tbn < sh)
1240 break;
1241 tbn -= sh;
1243 if (j == 0) {
1244 /* must have passed end of file */
1245 ASSERT(((uoff_t)lbn << fs->fs_bshift) >= isz);
1246 goto out;
1250 * Fetch the first indirect block.
1252 nb = ip->i_ib[NIADDR - j];
1253 if (nb == 0) {
1254 if (hole) {
1255 lbn = limits[NIADDR - j];
1256 goto out;
1257 } else {
1258 lbn = limits[NIADDR - j + 1];
1259 if ((uoff_t)lbn << fs->fs_bshift >= isz)
1260 goto out;
1261 goto loop;
1266 * Fetch through the indirect blocks.
1268 for (; ((j <= NIADDR) && (nb != 0)); j++) {
1269 ob = nb;
1271 * if there's a different block at this level then release
1272 * the old one and in with the new.
1274 if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
1275 if (bp[j-1] != NULL)
1276 brelse(bp[j-1]);
1277 bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
1278 if (bp[j-1]->b_flags & B_ERROR) {
1279 error = EIO;
1280 goto out;
1283 bap = bp[j-1]->b_un.b_daddr;
1285 shft -= nindirshift; /* sh / nindir */
1286 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1287 nb = bap[i];
1288 skip = 1LL << (nindirshift * (NIADDR - j));
1292 * Scan through the blocks in this array.
1294 for (; i < nindir; i++, lbn += skip) {
1295 if (hole && (bap[i] == 0))
1296 goto out;
1297 if (!hole && (bap[i] != 0)) {
1298 if (skip == 1) {
1299 /* we're at the lowest level */
1300 goto out;
1301 } else {
1302 goto loop;
1306 if (((uoff_t)lbn << fs->fs_bshift) < isz)
1307 goto loop;
1308 out:
1309 for (i = 0; i < NIADDR; i++) {
1310 if (bp[i])
1311 brelse(bp[i]);
1313 if (error == 0) {
1314 if (((uoff_t)lbn << fs->fs_bshift) >= isz) {
1315 error = ENXIO;
1316 } else {
1317 /* success */
1318 *off = (uoff_t)lbn << fs->fs_bshift;
1321 return (error);
1325 * Set a particular offset in the inode list to be a certain block.
1326 * User is responsible for calling TRANS* functions
1329 bmap_set_bn(struct vnode *vp, uoff_t off, daddr32_t bn)
1331 daddr_t lbn;
1332 struct inode *ip;
1333 ufsvfs_t *ufsvfsp;
1334 struct fs *fs;
1335 struct buf *bp;
1336 int i, j;
1337 int shft; /* we maintain sh = 1 << shft */
1338 int err;
1339 daddr_t ob, nb, tbn;
1340 daddr32_t *bap;
1341 int nindirshift, nindiroffset;
1343 ip = VTOI(vp);
1344 ufsvfsp = ip->i_ufsvfs;
1345 fs = ufsvfsp->vfs_fs;
1346 lbn = (daddr_t)lblkno(fs, off);
1348 ASSERT(RW_LOCK_HELD(&ip->i_contents));
1350 if (lbn < 0)
1351 return (EFBIG);
1354 * Take care of direct block assignment
1356 if (lbn < NDADDR) {
1357 ip->i_db[lbn] = bn;
1358 return (0);
1361 nindirshift = ip->i_ufsvfs->vfs_nindirshift;
1362 nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
1364 * Determine how many levels of indirection.
1366 shft = 0; /* sh = 1 */
1367 tbn = lbn - NDADDR;
1368 for (j = NIADDR; j > 0; j--) {
1369 longlong_t sh;
1371 shft += nindirshift; /* sh *= nindir */
1372 sh = 1LL << shft;
1373 if (tbn < sh)
1374 break;
1375 tbn -= sh;
1377 if (j == 0)
1378 return (EFBIG);
1381 * Fetch the first indirect block.
1383 nb = ip->i_ib[NIADDR - j];
1384 if (nb == 0) {
1385 err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1386 return (err);
1390 * Fetch through the indirect blocks.
1392 for (; j <= NIADDR; j++) {
1393 ob = nb;
1394 bp = UFS_BREAD(ufsvfsp,
1395 ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
1396 if (bp->b_flags & B_ERROR) {
1397 err = geterror(bp);
1398 brelse(bp);
1399 return (err);
1401 bap = bp->b_un.b_daddr;
1403 ASSERT(!ufs_indir_badblock(ip, bap));
1405 shft -= nindirshift; /* sh / nindir */
1406 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1408 nb = bap[i];
1409 if (nb == 0) {
1410 err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1411 return (err);
1414 if (j == NIADDR) {
1415 bap[i] = bn;
1416 bdrwrite(bp);
1417 return (0);
1420 brelse(bp);
1422 return (0);