4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
39 #include <sys/condvar_impl.h>
40 #include <sys/types.h>
41 #include <sys/t_lock.h>
42 #include <sys/debug.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/signal.h>
52 #include <sys/vnode.h>
54 #include <sys/fs/ufs_fs.h>
55 #include <sys/fs/ufs_inode.h>
56 #include <sys/fs/ufs_acl.h>
57 #include <sys/fs/ufs_bio.h>
58 #include <sys/fs/ufs_quota.h>
60 #include <sys/fs/ufs_trans.h>
61 #include <sys/fs/ufs_panic.h>
62 #include <sys/errno.h>
64 #include <sys/sysmacros.h>
66 #include <sys/fcntl.h>
67 #include <sys/flock.h>
68 #include <sys/fs_subr.h>
69 #include <sys/cmn_err.h>
70 #include <sys/policy.h>
71 #include <sys/fs/ufs_log.h>
73 static ino_t
hashalloc();
74 static daddr_t
fragextend();
75 static daddr_t
alloccg();
76 static daddr_t
alloccgblk();
77 static ino_t
ialloccg();
78 static daddr_t
mapsearch();
79 static int findlogstartcg();
81 extern int inside
[], around
[];
82 extern uchar_t
*fragtbl
[];
86 * Allocate a block in the file system.
88 * The size of the requested block is given, which must be some
89 * multiple of fs_fsize and <= fs_bsize.
90 * A preference may be optionally specified. If a preference is given
91 * the following hierarchy is used to allocate a block:
92 * 1) allocate the requested block.
93 * 2) allocate a rotationally optimal block in the same cylinder.
94 * 3) allocate a block in the same cylinder group.
95 * 4) quadratically rehash into other cylinder groups, until an
96 * available block is located.
97 * If no block preference is given the following hierarchy is used
98 * to allocate a block:
99 * 1) allocate a block in the cylinder group that contains the
100 * inode for the file.
101 * 2) quadratically rehash into other cylinder groups, until an
102 * available block is located.
105 alloc(struct inode
*ip
, daddr_t bpref
, int size
, daddr_t
*bnp
, cred_t
*cr
)
108 struct ufsvfs
*ufsvfsp
;
116 ufsvfsp
= ip
->i_ufsvfs
;
117 fs
= ufsvfsp
->vfs_fs
;
118 if ((unsigned)size
> fs
->fs_bsize
|| fragoff(fs
, size
) != 0) {
119 err
= ufs_fault(ITOV(ip
), "alloc: bad size, dev = 0x%lx,"
120 " bsize = %d, size = %d, fs = %s\n",
121 ip
->i_dev
, fs
->fs_bsize
, size
, fs
->fs_fsmnt
);
124 if (size
== fs
->fs_bsize
&& fs
->fs_cstotal
.cs_nbfree
== 0)
126 if (freespace(fs
, ufsvfsp
) <= 0 &&
127 secpolicy_fs_minfree(cr
, ufsvfsp
->vfs_vfs
) != 0)
129 err
= chkdq(ip
, (long)btodb(size
), 0, cr
, &errmsg
, &len
);
130 /* Note that we may not have err, but may still have errmsg */
131 if (errmsg
!= NULL
) {
133 kmem_free(errmsg
, len
);
138 if (bpref
>= fs
->fs_size
)
141 cg
= (int)itog(fs
, ip
->i_number
);
143 cg
= dtog(fs
, bpref
);
145 bno
= (daddr_t
)hashalloc(ip
, cg
, (long)bpref
, size
,
146 (ulong_t (*)())alloccg
);
153 * hashalloc() failed because some other thread grabbed
154 * the last block so unwind the quota operation. We can
155 * ignore the return because subtractions don't fail and
156 * size is guaranteed to be >= zero by our caller.
158 (void) chkdq(ip
, -(long)btodb(size
), 0, cr
, (char **)NULL
, NULL
);
161 now
= ddi_get_lbolt();
162 mutex_enter(&ufsvfsp
->vfs_lock
);
163 if ((now
- ufsvfsp
->vfs_lastwhinetime
) > (hz
<< 2) &&
164 (!(TRANS_ISTRANS(ufsvfsp
)) || !(ip
->i_flag
& IQUIET
))) {
165 ufsvfsp
->vfs_lastwhinetime
= now
;
166 cmn_err(CE_NOTE
, "alloc: %s: file system full", fs
->fs_fsmnt
);
168 mutex_exit(&ufsvfsp
->vfs_lock
);
173 * Reallocate a fragment to a bigger size
175 * The number and size of the old block is given, and a preference
176 * and new size is also specified. The allocator attempts to extend
177 * the original block. Failing that, the regular block allocator is
178 * invoked to get an appropriate block.
181 realloccg(struct inode
*ip
, daddr_t bprev
, daddr_t bpref
, int osize
,
182 int nsize
, daddr_t
*bnp
, cred_t
*cr
)
186 struct ufsvfs
*ufsvfsp
;
193 ufsvfsp
= ip
->i_ufsvfs
;
194 fs
= ufsvfsp
->vfs_fs
;
195 if ((unsigned)osize
> fs
->fs_bsize
|| fragoff(fs
, osize
) != 0 ||
196 (unsigned)nsize
> fs
->fs_bsize
|| fragoff(fs
, nsize
) != 0) {
197 err
= ufs_fault(ITOV(ip
),
198 "realloccg: bad size, dev=0x%lx, bsize=%d, "
199 "osize=%d, nsize=%d, fs=%s\n",
200 ip
->i_dev
, fs
->fs_bsize
, osize
, nsize
, fs
->fs_fsmnt
);
203 if (freespace(fs
, ufsvfsp
) <= 0 &&
204 secpolicy_fs_minfree(cr
, ufsvfsp
->vfs_vfs
) != 0)
207 err
= ufs_fault(ITOV(ip
),
208 "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
209 " bprev = %ld, fs = %s\n", ip
->i_dev
, fs
->fs_bsize
, bprev
,
213 err
= chkdq(ip
, (long)btodb(nsize
- osize
), 0, cr
, &errmsg
, &len
);
214 /* Note that we may not have err, but may still have errmsg */
215 if (errmsg
!= NULL
) {
217 kmem_free(errmsg
, len
);
222 cg
= dtog(fs
, bprev
);
223 bno
= fragextend(ip
, cg
, (long)bprev
, osize
, nsize
);
228 if (bpref
>= fs
->fs_size
)
232 * When optimizing for time we allocate a full block and
233 * then only use the upper portion for this request. When
234 * this file grows again it will grow into the unused portion
235 * of the block (See fragextend() above). This saves time
236 * because an extra disk write would be needed if the frags
237 * following the current allocation were not free. The extra
238 * disk write is needed to move the data from its current
239 * location into the newly allocated position.
241 * When optimizing for space we allocate a run of frags
242 * that is just the right size for this request.
244 request
= (fs
->fs_optim
== FS_OPTTIME
) ? fs
->fs_bsize
: nsize
;
245 bno
= (daddr_t
)hashalloc(ip
, cg
, (long)bpref
, request
,
246 (ulong_t (*)())alloccg
);
250 (void) free(ip
, bno
+ numfrags(fs
, nsize
),
251 (off_t
)(request
- nsize
), I_NOCANCEL
);
256 * hashalloc() failed because some other thread grabbed
257 * the last block so unwind the quota operation. We can
258 * ignore the return because subtractions don't fail, and
259 * our caller guarantees nsize >= osize.
261 (void) chkdq(ip
, -(long)btodb(nsize
- osize
), 0, cr
, (char **)NULL
,
265 now
= ddi_get_lbolt();
266 mutex_enter(&ufsvfsp
->vfs_lock
);
267 if ((now
- ufsvfsp
->vfs_lastwhinetime
) > (hz
<< 2) &&
268 (!(TRANS_ISTRANS(ufsvfsp
)) || !(ip
->i_flag
& IQUIET
))) {
269 ufsvfsp
->vfs_lastwhinetime
= now
;
271 "realloccg %s: file system full", fs
->fs_fsmnt
);
273 mutex_exit(&ufsvfsp
->vfs_lock
);
278 * Allocate an inode in the file system.
280 * A preference may be optionally specified. If a preference is given
281 * the following hierarchy is used to allocate an inode:
282 * 1) allocate the requested inode.
283 * 2) allocate an inode in the same cylinder group.
284 * 3) quadratically rehash into other cylinder groups, until an
285 * available inode is located.
286 * If no inode preference is given the following hierarchy is used
287 * to allocate an inode:
288 * 1) allocate an inode in cylinder group 0.
289 * 2) quadratically rehash into other cylinder groups, until an
290 * available inode is located.
293 ufs_ialloc(struct inode
*pip
,
294 ino_t ipref
, mode_t mode
, struct inode
**ipp
, cred_t
*cr
)
302 struct ufsvfs
*ufsvfsp
= pip
->i_ufsvfs
;
306 ASSERT(RW_WRITE_HELD(&pip
->i_rwlock
));
309 nifree
= fs
->fs_cstotal
.cs_nifree
;
314 * Shadow inodes don't count against a user's inode allocation.
315 * They are an implementation method and not a resource.
317 if ((mode
!= IFSHAD
) && (mode
!= IFATTRDIR
)) {
318 err
= chkiq((struct ufsvfs
*)ITOV(pip
)->v_vfsp
->vfs_data
,
319 /* change */ 1, NULL
, crgetuid(cr
), 0,
322 * As we haven't acquired any locks yet, dump the message
325 if (errmsg
!= NULL
) {
327 kmem_free(errmsg
, len
);
334 if (ipref
>= (ulong_t
)(fs
->fs_ncg
* fs
->fs_ipg
))
336 cg
= (int)itog(fs
, ipref
);
337 ino
= (ino_t
)hashalloc(pip
, cg
, (long)ipref
, (int)mode
,
338 (ulong_t (*)())ialloccg
);
340 if ((mode
!= IFSHAD
) && (mode
!= IFATTRDIR
)) {
342 * We can safely ignore the return from chkiq()
343 * because deallocations can only fail if we
344 * can't get the user's quota info record off
345 * the disk due to an I/O error. In that case,
346 * the quota subsystem is already messed up.
348 (void) chkiq(ufsvfsp
, /* change */ -1, NULL
,
349 crgetuid(cr
), 0, cr
, (char **)NULL
, NULL
);
353 err
= ufs_iget(pip
->i_vfs
, ino
, ipp
, cr
);
355 if ((mode
!= IFSHAD
) && (mode
!= IFATTRDIR
)) {
357 * See above comment about why it is safe to ignore an
360 (void) chkiq(ufsvfsp
, /* change */ -1, NULL
,
361 crgetuid(cr
), 0, cr
, (char **)NULL
, NULL
);
363 ufs_ifree(pip
, ino
, 0);
367 ASSERT(!ip
->i_ufs_acl
);
368 ASSERT(!ip
->i_dquot
);
369 rw_enter(&ip
->i_contents
, RW_WRITER
);
372 * Check if we really got a free inode, if not then complain
373 * and mark the inode ISTALE so that it will be freed by the
374 * ufs idle thread eventually and will not be sent to ufs_delete().
376 if (ip
->i_mode
|| (ip
->i_nlink
> 0)) {
377 ip
->i_flag
|= ISTALE
;
378 rw_exit(&ip
->i_contents
);
381 "%s: unexpected allocated inode %d, run fsck(1M)%s",
382 fs
->fs_fsmnt
, (int)ino
,
383 (TRANS_ISTRANS(ufsvfsp
) ? " -o f" : ""));
388 * Check the inode has no size or data blocks.
389 * This could have happened if the truncation failed when
390 * deleting the inode. It used to be possible for this to occur
391 * if a block allocation failed when iteratively truncating a
392 * large file using logging and with a full file system.
393 * This was fixed with bug fix 4348738. However, truncation may
394 * still fail on an IO error. So in all cases for safety and
395 * security we clear out the size; the blocks allocated; and
396 * pointers to the blocks. This will ultimately cause a fsck
397 * error of unaccounted-for blocks, but it's a fairly benign error,
398 * and possibly the correct thing to do anyway, as accessing those
399 * blocks again may lead to more IO errors.
401 if (ip
->i_size
|| ip
->i_blocks
) {
406 "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
407 fs
->fs_fsmnt
, (int)ino
, ip
->i_size
,
408 (TRANS_ISTRANS(ufsvfsp
) ? " -o f" : ""));
411 * Clear any garbage left behind.
415 for (i
= 0; i
< NDADDR
; i
++)
417 for (i
= 0; i
< NIADDR
; i
++)
422 * Initialize the link count
427 * Clear the old flags
432 * Access times are not really defined if the fs is mounted
433 * with 'noatime'. But it can cause nfs clients to fail
434 * open() if the atime is not a legal value. Set a legal value
435 * here when the inode is allocated.
437 if (ufsvfsp
->vfs_noatime
) {
438 mutex_enter(&ufs_iuniqtime_lock
);
439 ip
->i_atime
= iuniqtime
;
440 mutex_exit(&ufs_iuniqtime_lock
);
442 rw_exit(&ip
->i_contents
);
445 if (!(TRANS_ISTRANS(ufsvfsp
)) || !(pip
->i_flag
& IQUIET
))
446 cmn_err(CE_NOTE
, "%s: out of inodes\n", fs
->fs_fsmnt
);
451 * Find a cylinder group to place a directory.
452 * Returns an inumber within the selected cylinder group.
453 * Note, the vfs_lock is not needed as we don't require exact cg summary info.
455 * If the switch ufs_close_dirs is set, then the policy is to use
456 * the current cg if it has more than 25% free inodes and more
457 * than 25% free blocks. Otherwise the cgs are searched from
458 * the beginning and the first cg with the same criteria is
459 * used. If that is also null then we revert to the old algorithm.
460 * This tends to cluster files at the beginning of the disk
461 * until the disk gets full.
463 * Otherwise if ufs_close_dirs is not set then the original policy is
464 * used which is to select from among those cylinder groups with
465 * above the average number of free inodes, the one with the smallest
466 * number of directories.
469 int ufs_close_dirs
= 1; /* allocate directories close as possible */
474 int cg
, minndir
, mincg
, avgifree
, mininode
, minbpg
, ifree
;
475 struct fs
*fs
= dp
->i_fs
;
477 cg
= itog(fs
, dp
->i_number
);
478 mininode
= fs
->fs_ipg
>> 2;
479 minbpg
= fs
->fs_maxbpg
>> 2;
480 if (ufs_close_dirs
&&
481 (fs
->fs_cs(fs
, cg
).cs_nifree
> mininode
) &&
482 (fs
->fs_cs(fs
, cg
).cs_nbfree
> minbpg
)) {
483 return (dp
->i_number
);
486 avgifree
= fs
->fs_cstotal
.cs_nifree
/ fs
->fs_ncg
;
487 minndir
= fs
->fs_ipg
;
489 for (cg
= 0; cg
< fs
->fs_ncg
; cg
++) {
490 ifree
= fs
->fs_cs(fs
, cg
).cs_nifree
;
491 if (ufs_close_dirs
&&
492 (ifree
> mininode
) &&
493 (fs
->fs_cs(fs
, cg
).cs_nbfree
> minbpg
)) {
494 return ((ino_t
)(fs
->fs_ipg
* cg
));
496 if ((fs
->fs_cs(fs
, cg
).cs_ndir
< minndir
) &&
497 (ifree
>= avgifree
)) {
499 minndir
= fs
->fs_cs(fs
, cg
).cs_ndir
;
502 return ((ino_t
)(fs
->fs_ipg
* mincg
));
506 * Select the desired position for the next block in a file. The file is
507 * logically divided into sections. The first section is composed of the
508 * direct blocks. Each additional section contains fs_maxbpg blocks.
510 * If no blocks have been allocated in the first section, the policy is to
511 * request a block in the same cylinder group as the inode that describes
512 * the file. If no blocks have been allocated in any other section, the
513 * policy is to place the section in a cylinder group with a greater than
514 * average number of free blocks. An appropriate cylinder group is found
515 * by using a rotor that sweeps the cylinder groups. When a new group of
516 * blocks is needed, the sweep begins in the cylinder group following the
517 * cylinder group from which the previous allocation was made. The sweep
518 * continues until a cylinder group with greater than the average number
519 * of free blocks is found. If the allocation is for the first block in an
520 * indirect block, the information on the previous allocation is unavailable;
521 * here a best guess is made based upon the logical block number being
524 * If a section is already partially allocated, the policy is to
525 * contiguously allocate fs_maxcontig blocks. The end of one of these
526 * contiguous blocks and the beginning of the next is physically separated
527 * so that the disk head will be in transit between them for at least
528 * fs_rotdelay milliseconds. This is to allow time for the processor to
529 * schedule another I/O transfer.
532 blkpref(struct inode
*ip
, daddr_t lbn
, int indx
, daddr32_t
*bap
)
535 struct ufsvfs
*ufsvfsp
;
537 int avgbfree
, startcg
;
540 ufsvfsp
= ip
->i_ufsvfs
;
542 if (indx
% fs
->fs_maxbpg
== 0 || bap
[indx
- 1] == 0) {
544 cg
= itog(fs
, ip
->i_number
);
545 return (fs
->fs_fpg
* cg
+ fs
->fs_frag
);
548 * Find a cylinder with greater than average
549 * number of unused data blocks.
551 if (indx
== 0 || bap
[indx
- 1] == 0)
552 startcg
= itog(fs
, ip
->i_number
) + lbn
/ fs
->fs_maxbpg
;
554 startcg
= dtog(fs
, bap
[indx
- 1]) + 1;
555 startcg
%= fs
->fs_ncg
;
557 mutex_enter(&ufsvfsp
->vfs_lock
);
558 avgbfree
= fs
->fs_cstotal
.cs_nbfree
/ fs
->fs_ncg
;
560 * used for computing log space for writes/truncs
562 ufsvfsp
->vfs_avgbfree
= avgbfree
;
563 for (cg
= startcg
; cg
< fs
->fs_ncg
; cg
++)
564 if (fs
->fs_cs(fs
, cg
).cs_nbfree
>= avgbfree
) {
566 mutex_exit(&ufsvfsp
->vfs_lock
);
567 return (fs
->fs_fpg
* cg
+ fs
->fs_frag
);
569 for (cg
= 0; cg
<= startcg
; cg
++)
570 if (fs
->fs_cs(fs
, cg
).cs_nbfree
>= avgbfree
) {
572 mutex_exit(&ufsvfsp
->vfs_lock
);
573 return (fs
->fs_fpg
* cg
+ fs
->fs_frag
);
575 mutex_exit(&ufsvfsp
->vfs_lock
);
579 * One or more previous blocks have been laid out. If less
580 * than fs_maxcontig previous blocks are contiguous, the
581 * next block is requested contiguously, otherwise it is
582 * requested rotationally delayed by fs_rotdelay milliseconds.
585 nextblk
= bap
[indx
- 1];
587 * Provision for fallocate to return positive
588 * blk preference based on last allocation
590 if (nextblk
< 0 && nextblk
!= UFS_HOLE
) {
591 nextblk
= (-bap
[indx
- 1]) + fs
->fs_frag
;
593 nextblk
= bap
[indx
- 1] + fs
->fs_frag
;
596 if (indx
> fs
->fs_maxcontig
&& bap
[indx
- fs
->fs_maxcontig
] +
597 blkstofrags(fs
, fs
->fs_maxcontig
) != nextblk
) {
600 if (fs
->fs_rotdelay
!= 0)
602 * Here we convert ms of delay to frags as:
603 * (frags) = (ms) * (rev/sec) * (sect/rev) /
604 * ((sect/frag) * (ms/sec))
605 * then round up to the next block.
607 nextblk
+= roundup(fs
->fs_rotdelay
* fs
->fs_rps
* fs
->fs_nsect
/
608 (NSPF(fs
) * 1000), fs
->fs_frag
);
613 * Free a block or fragment.
615 * The specified block or fragment is placed back in the
616 * free map. If a fragment is deallocated, a possible
617 * block reassembly is checked.
620 free(struct inode
*ip
, daddr_t bno
, off_t size
, int flags
)
622 struct fs
*fs
= ip
->i_fs
;
623 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
624 struct ufs_q
*delq
= &ufsvfsp
->vfs_delete
;
625 struct ufs_delq_info
*delq_info
= &ufsvfsp
->vfs_delete_info
;
633 daddr_t blkno
, cylno
, rpos
;
636 * fallocate'd files will have negative block address.
637 * So negate it again to get original block address.
639 if (bno
< 0 && (bno
% fs
->fs_frag
== 0) && bno
!= UFS_HOLE
) {
643 if ((unsigned long)size
> fs
->fs_bsize
|| fragoff(fs
, size
) != 0) {
644 (void) ufs_fault(ITOV(ip
),
645 "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
646 "fs = %s\n", ip
->i_dev
, fs
->fs_bsize
,
647 (int)size
, fs
->fs_fsmnt
);
651 ASSERT(!ufs_badblock(ip
, bno
));
652 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
656 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
)) {
661 if (!(flags
& I_NOCANCEL
))
662 TRANS_CANCEL(ufsvfsp
, ldbtob(fsbtodb(fs
, bno
)), size
, flags
);
663 if (flags
& (I_DIR
|I_IBLK
|I_SHAD
|I_QUOTA
)) {
664 TRANS_MATA_FREE(ufsvfsp
, ldbtob(fsbtodb(fs
, bno
)), size
);
666 blksfree
= cg_blksfree(cgp
);
667 blktot
= cg_blktot(cgp
);
668 mutex_enter(&ufsvfsp
->vfs_lock
);
669 cgp
->cg_time
= gethrestime_sec();
670 bno
= dtogd(fs
, bno
);
671 if (size
== fs
->fs_bsize
) {
672 blkno
= fragstoblks(fs
, bno
);
673 cylno
= cbtocylno(fs
, bno
);
674 rpos
= cbtorpos(ufsvfsp
, bno
);
675 blks
= cg_blks(ufsvfsp
, cgp
, cylno
);
676 if (!isclrblock(fs
, blksfree
, blkno
)) {
677 mutex_exit(&ufsvfsp
->vfs_lock
);
679 (void) ufs_fault(ITOV(ip
), "free: freeing free block, "
680 "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
681 ip
->i_dev
, bno
, ip
->i_number
, fs
->fs_fsmnt
);
684 setblock(fs
, blksfree
, blkno
);
687 cgp
->cg_cs
.cs_nbfree
++; /* Log below */
688 fs
->fs_cstotal
.cs_nbfree
++;
689 fs
->fs_cs(fs
, cg
).cs_nbfree
++;
690 if (TRANS_ISTRANS(ufsvfsp
) && (flags
& I_ACCT
)) {
691 mutex_enter(&delq
->uq_mutex
);
692 delq_info
->delq_unreclaimed_blocks
-=
694 mutex_exit(&delq
->uq_mutex
);
697 bbase
= bno
- fragnum(fs
, bno
);
699 * Decrement the counts associated with the old frags
701 bmap
= blkmap(fs
, blksfree
, bbase
);
702 fragacct(fs
, bmap
, cgp
->cg_frsum
, -1);
704 * Deallocate the fragment
706 for (i
= 0; i
< numfrags(fs
, size
); i
++) {
707 if (isset(blksfree
, bno
+ i
)) {
709 mutex_exit(&ufsvfsp
->vfs_lock
);
710 (void) ufs_fault(ITOV(ip
),
711 "free: freeing free frag, "
712 "dev:0x%lx, blk:%ld, cg:%d, "
721 setbit(blksfree
, bno
+ i
);
723 cgp
->cg_cs
.cs_nffree
+= i
;
724 fs
->fs_cstotal
.cs_nffree
+= i
;
725 fs
->fs_cs(fs
, cg
).cs_nffree
+= i
;
726 if (TRANS_ISTRANS(ufsvfsp
) && (flags
& I_ACCT
)) {
727 mutex_enter(&delq
->uq_mutex
);
728 delq_info
->delq_unreclaimed_blocks
-=
729 btodb(i
* fs
->fs_fsize
);
730 mutex_exit(&delq
->uq_mutex
);
733 * Add back in counts associated with the new frags
735 bmap
= blkmap(fs
, blksfree
, bbase
);
736 fragacct(fs
, bmap
, cgp
->cg_frsum
, 1);
738 * If a complete block has been reassembled, account for it
740 blkno
= fragstoblks(fs
, bbase
);
741 if (isblock(fs
, blksfree
, blkno
)) {
742 cylno
= cbtocylno(fs
, bbase
);
743 rpos
= cbtorpos(ufsvfsp
, bbase
);
744 blks
= cg_blks(ufsvfsp
, cgp
, cylno
);
747 cgp
->cg_cs
.cs_nffree
-= fs
->fs_frag
;
748 fs
->fs_cstotal
.cs_nffree
-= fs
->fs_frag
;
749 fs
->fs_cs(fs
, cg
).cs_nffree
-= fs
->fs_frag
;
750 cgp
->cg_cs
.cs_nbfree
++;
751 fs
->fs_cstotal
.cs_nbfree
++;
752 fs
->fs_cs(fs
, cg
).cs_nbfree
++;
756 ufs_notclean(ufsvfsp
);
757 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
758 TRANS_SI(ufsvfsp
, fs
, cg
);
765 * The specified inode is placed back in the free map.
768 ufs_ifree(struct inode
*ip
, ino_t ino
, mode_t mode
)
770 struct fs
*fs
= ip
->i_fs
;
771 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
778 if (ip
->i_number
== ino
&& ip
->i_mode
!= 0) {
779 (void) ufs_fault(ITOV(ip
),
780 "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
782 ip
->i_mode
, mode
, (int)ip
->i_number
, fs
->fs_fsmnt
);
785 if (ino
>= fs
->fs_ipg
* fs
->fs_ncg
) {
786 (void) ufs_fault(ITOV(ip
),
787 "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
788 (int)ip
->i_dev
, (int)ino
, fs
->fs_fsmnt
);
791 cg
= (int)itog(fs
, ino
);
792 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
796 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
)) {
800 mutex_enter(&ufsvfsp
->vfs_lock
);
801 cgp
->cg_time
= gethrestime_sec();
802 iused
= cg_inosused(cgp
);
803 inot
= (unsigned int)(ino
% (ulong_t
)fs
->fs_ipg
);
804 if (isclr(iused
, inot
)) {
805 mutex_exit(&ufsvfsp
->vfs_lock
);
807 (void) ufs_fault(ITOV(ip
), "ufs_ifree: freeing free inode, "
808 "mode: (imode) %o, (omode) %o, ino:%d, "
810 ip
->i_mode
, mode
, (int)ino
, fs
->fs_fsmnt
);
815 if (inot
< (ulong_t
)cgp
->cg_irotor
)
816 cgp
->cg_irotor
= inot
;
817 cgp
->cg_cs
.cs_nifree
++;
818 fs
->fs_cstotal
.cs_nifree
++;
819 fs
->fs_cs(fs
, cg
).cs_nifree
++;
820 if (((mode
& IFMT
) == IFDIR
) || ((mode
& IFMT
) == IFATTRDIR
)) {
821 cgp
->cg_cs
.cs_ndir
--;
822 fs
->fs_cstotal
.cs_ndir
--;
823 fs
->fs_cs(fs
, cg
).cs_ndir
--;
826 ufs_notclean(ufsvfsp
);
827 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
828 TRANS_SI(ufsvfsp
, fs
, cg
);
833 * Implement the cylinder overflow algorithm.
835 * The policy implemented by this algorithm is:
836 * 1) allocate the block in its requested cylinder group.
837 * 2) quadratically rehash on the cylinder group number.
838 * 3) brute force search for a free block.
839 * The size parameter means size for data blocks, mode for inodes.
842 hashalloc(struct inode
*ip
, int cg
, long pref
, int size
, ulong_t (*allocator
)())
851 * 1: preferred cylinder group
853 result
= (*allocator
)(ip
, cg
, pref
, size
);
857 * 2: quadratic rehash
859 for (i
= 1; i
< fs
->fs_ncg
; i
*= 2) {
861 if (cg
>= fs
->fs_ncg
)
863 result
= (*allocator
)(ip
, cg
, 0, size
);
868 * 3: brute force search
869 * Note that we start at i == 2, since 0 was checked initially,
870 * and 1 is always checked in the quadratic rehash.
872 cg
= (icg
+ 2) % fs
->fs_ncg
;
873 for (i
= 2; i
< fs
->fs_ncg
; i
++) {
874 result
= (*allocator
)(ip
, cg
, 0, size
);
878 if (cg
== fs
->fs_ncg
)
885 * Determine whether a fragment can be extended.
887 * Check to see if the necessary fragments are available, and
888 * if they are, allocate them.
891 fragextend(struct inode
*ip
, int cg
, long bprev
, int osize
, int nsize
)
893 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
894 struct fs
*fs
= ip
->i_fs
;
902 if (fs
->fs_cs(fs
, cg
).cs_nffree
< numfrags(fs
, nsize
- osize
))
904 frags
= numfrags(fs
, nsize
);
905 bbase
= (int)fragnum(fs
, bprev
);
906 if (bbase
> fragnum(fs
, (bprev
+ frags
- 1))) {
907 /* cannot extend across a block boundary */
911 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
914 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
)) {
919 blksfree
= cg_blksfree(cgp
);
920 mutex_enter(&ufsvfsp
->vfs_lock
);
921 bno
= dtogd(fs
, bprev
);
922 for (i
= numfrags(fs
, osize
); i
< frags
; i
++) {
923 if (isclr(blksfree
, bno
+ i
)) {
924 mutex_exit(&ufsvfsp
->vfs_lock
);
928 if ((TRANS_ISCANCEL(ufsvfsp
, ldbtob(fsbtodb(fs
, bprev
+ i
)),
930 mutex_exit(&ufsvfsp
->vfs_lock
);
936 cgp
->cg_time
= gethrestime_sec();
938 * The current fragment can be extended,
939 * deduct the count on fragment being extended into
940 * increase the count on the remaining fragment (if any)
941 * allocate the extended piece.
943 for (i
= frags
; i
< fs
->fs_frag
- bbase
; i
++)
944 if (isclr(blksfree
, bno
+ i
))
946 j
= i
- numfrags(fs
, osize
);
948 ASSERT(cgp
->cg_frsum
[j
] >= 0);
950 cgp
->cg_frsum
[i
- frags
]++;
951 for (i
= numfrags(fs
, osize
); i
< frags
; i
++) {
952 clrbit(blksfree
, bno
+ i
);
953 cgp
->cg_cs
.cs_nffree
--;
954 fs
->fs_cs(fs
, cg
).cs_nffree
--;
955 fs
->fs_cstotal
.cs_nffree
--;
958 ufs_notclean(ufsvfsp
);
959 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
960 TRANS_SI(ufsvfsp
, fs
, cg
);
962 return ((daddr_t
)bprev
);
966 * Determine whether a block can be allocated.
968 * Check to see if a block of the appropriate size
969 * is available, and if it is, allocate it.
972 alloccg(struct inode
*ip
, int cg
, daddr_t bpref
, int size
)
974 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
975 struct fs
*fs
= ip
->i_fs
;
984 * Searching for space could be time expensive so do some
985 * up front checking to verify that there is actually space
986 * available (free blocks or free frags).
988 if (fs
->fs_cs(fs
, cg
).cs_nbfree
== 0) {
989 if (size
== fs
->fs_bsize
)
993 * If there are not enough free frags then return.
995 if (fs
->fs_cs(fs
, cg
).cs_nffree
< numfrags(fs
, size
))
999 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
1000 (int)fs
->fs_cgsize
);
1002 cgp
= bp
->b_un
.b_cg
;
1003 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
) ||
1004 (cgp
->cg_cs
.cs_nbfree
== 0 && size
== fs
->fs_bsize
)) {
1008 blksfree
= cg_blksfree(cgp
);
1009 mutex_enter(&ufsvfsp
->vfs_lock
);
1010 cgp
->cg_time
= gethrestime_sec();
1011 if (size
== fs
->fs_bsize
) {
1012 if ((bno
= alloccgblk(ufsvfsp
, cgp
, bpref
, bp
)) == 0)
1015 ufs_notclean(ufsvfsp
);
1016 TRANS_SI(ufsvfsp
, fs
, cg
);
1021 * Check fragment bitmap to see if any fragments are already available.
1022 * mapsearch() may fail because the fragment that fits this request
1023 * might still be on the cancel list and not available for re-use yet.
1024 * Look for a bigger sized fragment to allocate first before we have
1025 * to give up and fragment a whole new block eventually.
1027 frags
= numfrags(fs
, size
);
1030 for (; allocsiz
< fs
->fs_frag
; allocsiz
++)
1031 if (cgp
->cg_frsum
[allocsiz
] != 0)
1034 if (allocsiz
!= fs
->fs_frag
) {
1035 bno
= mapsearch(ufsvfsp
, cgp
, bpref
, allocsiz
);
1036 if (bno
< 0 && allocsiz
< (fs
->fs_frag
- 1)) {
1042 if (allocsiz
== fs
->fs_frag
|| bno
< 0) {
1044 * No fragments were available, so a block
1045 * will be allocated and hacked up.
1047 if (cgp
->cg_cs
.cs_nbfree
== 0)
1049 if ((bno
= alloccgblk(ufsvfsp
, cgp
, bpref
, bp
)) == 0)
1051 bpref
= dtogd(fs
, bno
);
1052 for (i
= frags
; i
< fs
->fs_frag
; i
++)
1053 setbit(blksfree
, bpref
+ i
);
1054 i
= fs
->fs_frag
- frags
;
1055 cgp
->cg_cs
.cs_nffree
+= i
;
1056 fs
->fs_cstotal
.cs_nffree
+= i
;
1057 fs
->fs_cs(fs
, cg
).cs_nffree
+= i
;
1060 ufs_notclean(ufsvfsp
);
1061 TRANS_SI(ufsvfsp
, fs
, cg
);
1066 for (i
= 0; i
< frags
; i
++)
1067 clrbit(blksfree
, bno
+ i
);
1068 cgp
->cg_cs
.cs_nffree
-= frags
;
1069 fs
->fs_cstotal
.cs_nffree
-= frags
;
1070 fs
->fs_cs(fs
, cg
).cs_nffree
-= frags
;
1071 cgp
->cg_frsum
[allocsiz
]--;
1072 ASSERT(cgp
->cg_frsum
[allocsiz
] >= 0);
1073 if (frags
!= allocsiz
) {
1074 cgp
->cg_frsum
[allocsiz
- frags
]++;
1077 ufs_notclean(ufsvfsp
);
1078 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
1079 TRANS_SI(ufsvfsp
, fs
, cg
);
1081 return (cg
* fs
->fs_fpg
+ bno
);
1083 mutex_exit(&ufsvfsp
->vfs_lock
);
1089 * Allocate a block in a cylinder group.
1091 * This algorithm implements the following policy:
1092 * 1) allocate the requested block.
1093 * 2) allocate a rotationally optimal block in the same cylinder.
1094 * 3) allocate the next available block on the block rotor for the
1095 * specified cylinder group.
1096 * Note that this routine only allocates fs_bsize blocks; these
1097 * blocks may be fragmented by the routine that allocates them.
1101 struct ufsvfs
*ufsvfsp
,
1107 int cylno
, pos
, delta
, rotbl_size
;
1112 daddr_t blkno
, rpos
, frag
;
1116 ASSERT(MUTEX_HELD(&ufsvfsp
->vfs_lock
));
1117 fs
= ufsvfsp
->vfs_fs
;
1118 blksfree
= cg_blksfree(cgp
);
1120 bpref
= cgp
->cg_rotor
;
1123 bpref
= blknum(fs
, bpref
);
1124 bpref
= dtogd(fs
, bpref
);
1126 * If the requested block is available, use it.
1128 if (isblock(fs
, blksfree
, (daddr_t
)fragstoblks(fs
, bpref
))) {
1133 * Check for a block available on the same cylinder.
1135 cylno
= cbtocylno(fs
, bpref
);
1136 if (cg_blktot(cgp
)[cylno
] == 0)
1138 if (fs
->fs_cpc
== 0) {
1140 * Block layout info is not available, so just
1141 * have to take any block in this cylinder.
1143 bpref
= howmany(fs
->fs_spc
* cylno
, NSPF(fs
));
1147 * Check the summary information to see if a block is
1148 * available in the requested cylinder starting at the
1149 * requested rotational position and proceeding around.
1151 cylbp
= cg_blks(ufsvfsp
, cgp
, cylno
);
1152 pos
= cbtorpos(ufsvfsp
, bpref
);
1153 for (i
= pos
; i
< ufsvfsp
->vfs_nrpos
; i
++)
1156 if (i
== ufsvfsp
->vfs_nrpos
)
1157 for (i
= 0; i
< pos
; i
++)
1162 * Found a rotational position, now find the actual
1163 * block. A "panic" if none is actually there.
1167 * Up to this point, "pos" has referred to the rotational
1168 * position of the desired block. From now on, it holds
1169 * the offset of the current cylinder within a cylinder
1170 * cycle. (A cylinder cycle refers to a set of cylinders
1171 * which are described by a single rotational table; the
1172 * size of the cycle is fs_cpc.)
1174 * bno is set to the block number of the first block within
1175 * the current cylinder cycle.
1178 pos
= cylno
% fs
->fs_cpc
;
1179 bno
= (cylno
- pos
) * fs
->fs_spc
/ NSPB(fs
);
1182 * The blocks within a cylinder are grouped into equivalence
1183 * classes according to their "rotational position." There
1184 * are two tables used to determine these classes.
1186 * The positional offset table (fs_postbl) has an entry for
1187 * each rotational position of each cylinder in a cylinder
1188 * cycle. This entry contains the relative block number
1189 * (counting from the start of the cylinder cycle) of the
1190 * first block in the equivalence class for that position
1191 * and that cylinder. Positions for which no blocks exist
1192 * are indicated by a -1.
1194 * The rotational delta table (fs_rotbl) has an entry for
1195 * each block in a cylinder cycle. This entry contains
1196 * the offset from that block to the next block in the
1197 * same equivalence class. The last block in the class
1198 * is indicated by a zero in the table.
1200 * The following code, then, walks through all of the blocks
1201 * in the cylinder (cylno) which we're allocating within
1202 * which are in the equivalence class for the rotational
1203 * position (i) which we're allocating within.
1206 if (fs_postbl(ufsvfsp
, pos
)[i
] == -1) {
1207 (void) ufs_fault(ufsvfsp
->vfs_root
,
1208 "alloccgblk: cyl groups corrupted, pos = %d, "
1209 "i = %d, fs = %s\n", pos
, i
, fs
->fs_fsmnt
);
1214 * There is one entry in the rotational table for each block
1215 * in the cylinder cycle. These are whole blocks, not frags.
1218 rotbl_size
= (fs
->fs_cpc
* fs
->fs_spc
) >>
1219 (fs
->fs_fragshift
+ fs
->fs_fsbtodb
);
1222 * As we start, "i" is the rotational position within which
1223 * we're searching. After the next line, it will be a block
1224 * number (relative to the start of the cylinder cycle)
1225 * within the equivalence class of that rotational position.
1228 i
= fs_postbl(ufsvfsp
, pos
)[i
];
1231 if (isblock(fs
, blksfree
, (daddr_t
)(bno
+ i
))) {
1232 bno
= blkstofrags(fs
, (bno
+ i
));
1235 delta
= fs_rotbl(fs
)[i
];
1236 if (delta
<= 0 || /* End of chain, or */
1237 delta
+ i
> rotbl_size
) /* end of table? */
1238 break; /* If so, panic. */
1241 (void) ufs_fault(ufsvfsp
->vfs_root
,
1242 "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
1243 "fs:%s bno: %x\n", pos
, i
, fs
->fs_fsmnt
, (int)bno
);
1248 * No blocks in the requested cylinder, so take
1249 * next available one in this cylinder group.
1251 bno
= mapsearch(ufsvfsp
, cgp
, bpref
, (int)fs
->fs_frag
);
1254 cgp
->cg_rotor
= bno
;
1256 blkno
= fragstoblks(fs
, bno
);
1257 frag
= (cgp
->cg_cgx
* fs
->fs_fpg
) + bno
;
1258 if (TRANS_ISCANCEL(ufsvfsp
, ldbtob(fsbtodb(fs
, frag
)), fs
->fs_bsize
))
1260 clrblock(fs
, blksfree
, (long)blkno
);
1262 * the other cg/sb/si fields are TRANS'ed by the caller
1264 cgp
->cg_cs
.cs_nbfree
--;
1265 fs
->fs_cstotal
.cs_nbfree
--;
1266 fs
->fs_cs(fs
, cgp
->cg_cgx
).cs_nbfree
--;
1267 cylno
= cbtocylno(fs
, bno
);
1268 blks
= cg_blks(ufsvfsp
, cgp
, cylno
);
1269 rpos
= cbtorpos(ufsvfsp
, bno
);
1270 blktot
= cg_blktot(cgp
);
1273 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
1279 * Determine whether an inode can be allocated.
1281 * Check to see if an inode is available, and if it is,
1282 * allocate it using the following policy:
1283 * 1) allocate the requested inode.
1284 * 2) allocate the next available inode after the requested
1285 * inode in the specified cylinder group.
1288 ialloccg(struct inode
*ip
, int cg
, daddr_t ipref
, int mode
)
1290 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
1291 struct fs
*fs
= ip
->i_fs
;
1294 int start
, len
, loc
, map
, i
;
1297 if (fs
->fs_cs(fs
, cg
).cs_nifree
== 0)
1299 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
1300 (int)fs
->fs_cgsize
);
1302 cgp
= bp
->b_un
.b_cg
;
1303 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
) ||
1304 cgp
->cg_cs
.cs_nifree
== 0) {
1308 iused
= cg_inosused(cgp
);
1309 mutex_enter(&ufsvfsp
->vfs_lock
);
1311 * While we are waiting for the mutex, someone may have taken
1312 * the last available inode. Need to recheck.
1314 if (cgp
->cg_cs
.cs_nifree
== 0) {
1315 mutex_exit(&ufsvfsp
->vfs_lock
);
1320 cgp
->cg_time
= gethrestime_sec();
1322 ipref
%= fs
->fs_ipg
;
1323 if (isclr(iused
, ipref
))
1326 start
= cgp
->cg_irotor
/ NBBY
;
1327 len
= howmany(fs
->fs_ipg
- cgp
->cg_irotor
, NBBY
);
1328 loc
= skpc(0xff, (uint_t
)len
, &iused
[start
]);
1332 loc
= skpc(0xff, (uint_t
)len
, &iused
[0]);
1334 mutex_exit(&ufsvfsp
->vfs_lock
);
1335 (void) ufs_fault(ITOV(ip
),
1336 "ialloccg: map corrupted, cg = %d, irotor = %d, "
1337 "fs = %s\n", cg
, (int)cgp
->cg_irotor
, fs
->fs_fsmnt
);
1341 i
= start
+ len
- loc
;
1344 for (i
= 1; i
< (1 << NBBY
); i
<<= 1, ipref
++) {
1345 if ((map
& i
) == 0) {
1346 cgp
->cg_irotor
= ipref
;
1351 mutex_exit(&ufsvfsp
->vfs_lock
);
1352 (void) ufs_fault(ITOV(ip
), "ialloccg: block not in mapfs = %s",
1356 setbit(iused
, ipref
);
1357 cgp
->cg_cs
.cs_nifree
--;
1358 fs
->fs_cstotal
.cs_nifree
--;
1359 fs
->fs_cs(fs
, cg
).cs_nifree
--;
1360 if (((mode
& IFMT
) == IFDIR
) || ((mode
& IFMT
) == IFATTRDIR
)) {
1361 cgp
->cg_cs
.cs_ndir
++;
1362 fs
->fs_cstotal
.cs_ndir
++;
1363 fs
->fs_cs(fs
, cg
).cs_ndir
++;
1366 ufs_notclean(ufsvfsp
);
1367 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
1368 TRANS_SI(ufsvfsp
, fs
, cg
);
1370 return (cg
* fs
->fs_ipg
+ ipref
);
1374 * Find a block of the specified size in the specified cylinder group.
1376 * It is a panic if a request is made to find a block if none are
1380 mapsearch(struct ufsvfs
*ufsvfsp
, struct cg
*cgp
, daddr_t bpref
,
1383 struct fs
*fs
= ufsvfsp
->vfs_fs
;
1385 int start
, len
, loc
, i
, last
, first
, secondtime
;
1386 int blk
, field
, subfield
, pos
;
1390 * ufsvfs->vfs_lock is held when calling this.
1393 * Find the fragment by searching through the
1394 * free block map for an appropriate bit pattern.
1397 start
= dtogd(fs
, bpref
) / NBBY
;
1399 start
= cgp
->cg_frotor
/ NBBY
;
1401 * the following loop performs two scans -- the first scan
1402 * searches the bottom half of the array for a match and the
1403 * second scan searches the top half of the array. The loops
1404 * have been merged just to make things difficult.
1407 last
= howmany(fs
->fs_fpg
, NBBY
);
1409 cfrag
= cgp
->cg_cgx
* fs
->fs_fpg
;
1410 while (first
< last
) {
1413 * search the array for a match
1415 loc
= scanc((unsigned)len
, (uchar_t
*)&cg_blksfree(cgp
)[first
],
1416 (uchar_t
*)fragtbl
[fs
->fs_frag
],
1417 (int)(1 << (allocsiz
- 1 + (fs
->fs_frag
% NBBY
))));
1422 bno
= (last
- loc
) * NBBY
;
1425 * Found the byte in the map, sift
1426 * through the bits to find the selected frag
1428 cgp
->cg_frotor
= bno
;
1430 for (i
= bno
+ NBBY
; bno
< i
; bno
+= fs
->fs_frag
) {
1431 blk
= blkmap(fs
, cg_blksfree(cgp
), bno
);
1433 field
= around
[allocsiz
];
1434 subfield
= inside
[allocsiz
];
1436 pos
<= fs
->fs_frag
- allocsiz
;
1438 if ((blk
& field
) == subfield
) {
1451 * success if block is *not* being converted from
1452 * metadata into userdata (harpy). If so, ignore.
1454 if (!TRANS_ISCANCEL(ufsvfsp
,
1455 ldbtob(fsbtodb(fs
, (cfrag
+bno
))),
1456 allocsiz
* fs
->fs_fsize
))
1460 * keep looking -- this block is being converted
1462 first
= (last
- loc
) + 1;
1468 * no usable matches in bottom half -- now search the top half
1472 * no usable matches in top half -- all done
1482 return ((daddr_t
)-1);
/*
 * Helpers for addressing the block pointers of an inode: UFSNADDR is the
 * total pointer count and IB(i) turns an indirection level (SINGLE,
 * DOUBLE, TRIPLE) into an index past the NDADDR direct pointers.
 */
#define	UFSNADDR (NDADDR + NIADDR)	/* NADDR applies to (obsolete) S5FS */
#define	IB(i)	(NDADDR + (i))	/* index of i'th indirect block ptr */
#define	SINGLE	0	/* single indirect block ptr */
#define	DOUBLE	1	/* double indirect block ptr */
#define	TRIPLE	2	/* triple indirect block ptr */
1492 * Acquire a write lock, and keep trying till we get it
1495 allocsp_wlockfs(struct vnode
*vp
, struct lockfs
*lf
)
1501 err
= ufs_fiolfss(vp
, lf
);
1504 } while (!LOCKFS_IS_ULOCK(lf
));
1506 lf
->lf_lock
= LOCKFS_WLOCK
;
1508 lf
->lf_comment
= NULL
;
1509 err
= ufs__fiolfs(vp
, lf
, 1, 0);
1511 if (err
== EBUSY
|| err
== EINVAL
)
1518 * Release the write lock
1521 allocsp_unlockfs(struct vnode
*vp
, struct lockfs
*lf
)
1525 lf
->lf_lock
= LOCKFS_ULOCK
;
1527 err
= ufs__fiolfs(vp
, lf
, 1, 0);
/*
 * One entry per block newly allocated by ufs_allocsp() beyond the direct
 * blocks; the entries form a singly linked list that is walked to undo
 * the allocations if the request cannot be completed.
 */
struct allocsp_undo {
	daddr_t offset;			/* file offset passed to bmap_set_bn() */
	daddr_t blk;			/* block handed back by bmap_write() */
	struct allocsp_undo *next;	/* next (older) entry, or NULL */
};
1538 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1539 * file system. For direct blocks, the blocks are allocated from the offset
1540 * requested to the block boundary, then any full blocks are allocated,
1541 * and finally any remainder.
1542 * For indirect blocks the blocks are not initialized and are
1543 * only marked as allocated. These addresses are then stored as negative
1544 * block numbers in the inode to imply special handling. UFS has been modified
1545 * where necessary to understand this new notion.
1546 * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
1549 ufs_allocsp(struct vnode
*vp
, struct flock64
*lp
, cred_t
*cr
)
1552 int berr
, err
, resv
, issync
;
1553 off_t istart
, len
; /* istart, special for idb */
1556 struct ufsvfs
*ufsvfsp
;
1557 uoff_t resid
, i
, uoff
;
1558 daddr32_t db_undo
[NDADDR
]; /* old direct blocks */
1559 struct allocsp_undo
*ib_undo
= NULL
; /* ib undo */
1560 struct allocsp_undo
*undo
= NULL
;
1561 uoff_t osz
; /* old file size */
1562 int chunkblks
= 0; /* # of blocks in 1 allocation */
1565 daddr_t totblks
= 0;
1566 struct ulockfs
*ulp
;
1568 int nbytes
, offsetn
;
1571 ASSERT(vp
->v_type
== VREG
);
1575 if ((ufsvfsp
= ip
->i_ufsvfs
) == NULL
) {
1580 istart
= blkroundup(fs
, (lp
->l_start
));
1581 len
= blkroundup(fs
, (lp
->l_len
));
1582 chunkblks
= blkroundup(fs
, ufsvfsp
->vfs_iotransz
) / fs
->fs_bsize
;
1583 ulp
= &ufsvfsp
->vfs_ulockfs
;
1585 if (lp
->l_start
< 0 || lp
->l_len
<= 0)
1588 /* Quickly check to make sure we have space before we proceed */
1589 if (lblkno(fs
, len
) > fs
->fs_cstotal
.cs_nbfree
) {
1590 if (TRANS_ISTRANS(ufsvfsp
)) {
1591 ufs_delete_drain_wait(ufsvfsp
, 1);
1592 if (lblkno(fs
, len
) > fs
->fs_cstotal
.cs_nbfree
)
1599 * We will keep i_rwlock locked as WRITER through out the function
1600 * since we don't want anyone else reading or writing to the inode
1601 * while we are in the middle of fallocating the file.
1603 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
1605 /* Back up the direct block list, used for undo later if necessary */
1606 rw_enter(&ip
->i_contents
, RW_READER
);
1607 for (i
= 0; i
< NDADDR
; i
++)
1608 db_undo
[i
] = ip
->i_db
[i
];
1610 rw_exit(&ip
->i_contents
);
1612 /* Write lock the file system */
1613 if (err
= allocsp_wlockfs(vp
, &lf
))
1617 * Allocate any direct blocks now.
1618 * Blocks are allocated from the offset requested to the block
1619 * boundary, then any full blocks are allocated, and finally any
1622 if (lblkno(fs
, lp
->l_start
) < NDADDR
) {
1623 ufs_trans_trunc_resv(ip
, ip
->i_size
+ (NDADDR
* fs
->fs_bsize
),
1625 TRANS_BEGIN_CSYNC(ufsvfsp
, &issync
, TOP_ALLOCSP
, resv
);
1627 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1628 rw_enter(&ip
->i_contents
, RW_WRITER
);
1631 while ((done_len
< lp
->l_len
) &&
1632 (lblkno(fs
, lp
->l_start
+ done_len
) < NDADDR
)) {
1633 uoff
= (offset_t
)(lp
->l_start
+ done_len
);
1634 offsetn
= (int)blkoff(fs
, uoff
);
1635 nbytes
= (int)MIN(fs
->fs_bsize
- offsetn
,
1636 lp
->l_len
- done_len
);
1638 berr
= bmap_write(ip
, uoff
, offsetn
+ nbytes
,
1639 BI_FALLOCATE
, &allocblk
, cr
);
1640 /* Yikes error, quit */
1642 TRANS_INODE(ufsvfsp
, ip
);
1643 rw_exit(&ip
->i_contents
);
1644 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1645 TRANS_END_CSYNC(ufsvfsp
, &err
, issync
,
1647 err
= allocsp_unlockfs(vp
, &lf
);
1653 if ((uoff
+ nbytes
) > ip
->i_size
)
1654 ip
->i_size
= (uoff
+ nbytes
);
1659 TRANS_INODE(ufsvfsp
, ip
);
1660 rw_exit(&ip
->i_contents
);
1661 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1662 TRANS_END_CSYNC(ufsvfsp
, &err
, issync
, TOP_ALLOCSP
, resv
);
1664 /* start offset for indirect allocation */
1665 istart
= (uoff
+ nbytes
);
1668 /* Break the transactions into vfs_iotransz units */
1669 ufs_trans_trunc_resv(ip
, ip
->i_size
+
1670 blkroundup(fs
, ufsvfsp
->vfs_iotransz
), &resv
, &resid
);
1671 TRANS_BEGIN_CSYNC(ufsvfsp
, &issync
, TOP_ALLOCSP
, resv
);
1673 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1674 rw_enter(&ip
->i_contents
, RW_WRITER
);
1676 /* Now go about fallocating necessary indirect blocks */
1677 for (i
= istart
; i
< (lp
->l_start
+ lp
->l_len
); i
+= fs
->fs_bsize
) {
1678 berr
= bmap_write(ip
, i
, fs
->fs_bsize
, BI_FALLOCATE
,
1681 TRANS_INODE(ufsvfsp
, ip
);
1682 rw_exit(&ip
->i_contents
);
1683 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1684 TRANS_END_CSYNC(ufsvfsp
, &err
, issync
, TOP_ALLOCSP
,
1686 err
= allocsp_unlockfs(vp
, &lf
);
1690 /* Update the blk counter only if new block was added */
1692 /* Save undo information */
1693 undo
= kmem_alloc(sizeof (struct allocsp_undo
),
1696 undo
->blk
= allocblk
;
1697 undo
->next
= ib_undo
;
1701 if (i
>= ip
->i_size
)
1702 ip
->i_size
+= fs
->fs_bsize
;
1706 /* Being a good UFS citizen, let others get a share */
1707 if (cnt
== chunkblks
) {
1709 * If there are waiters or the fs is hard locked,
1710 * error locked, or read-only error locked,
1713 if (ULOCKFS_IS_HLOCK(ulp
) || ULOCKFS_IS_ELOCK(ulp
) ||
1714 ULOCKFS_IS_ROELOCK(ulp
)) {
1715 ip
->i_cflags
|= IFALLOCATE
;
1716 TRANS_INODE(ufsvfsp
, ip
);
1717 rw_exit(&ip
->i_contents
);
1718 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1720 TRANS_END_CSYNC(ufsvfsp
, &err
, issync
,
1722 rw_exit(&ip
->i_rwlock
);
1723 (void) allocsp_unlockfs(vp
, &lf
);
1727 TRANS_INODE(ufsvfsp
, ip
);
1728 rw_exit(&ip
->i_contents
);
1729 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1731 /* End the current transaction */
1732 TRANS_END_CSYNC(ufsvfsp
, &err
, issync
, TOP_ALLOCSP
,
1735 if (CV_HAS_WAITERS(&ulp
->ul_cv
)) {
1736 /* Release the write lock */
1737 if (err
= allocsp_unlockfs(vp
, &lf
))
1740 /* Wake up others waiting to do operations */
1741 mutex_enter(&ulp
->ul_lock
);
1742 cv_broadcast(&ulp
->ul_cv
);
1743 mutex_exit(&ulp
->ul_lock
);
1745 /* Grab the write lock again */
1746 if (err
= allocsp_wlockfs(vp
, &lf
))
1748 } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1750 /* Reserve more space in log for this file */
1751 ufs_trans_trunc_resv(ip
,
1752 ip
->i_size
+ blkroundup(fs
, ufsvfsp
->vfs_iotransz
),
1754 TRANS_BEGIN_CSYNC(ufsvfsp
, &issync
, TOP_ALLOCSP
, resv
);
1756 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1757 rw_enter(&ip
->i_contents
, RW_WRITER
);
1759 cnt
= 0; /* reset cnt b/c of new transaction */
1764 ip
->i_cflags
|= IFALLOCATE
;
1766 /* If the file has grown then correct the file size */
1767 if (osz
< (lp
->l_start
+ lp
->l_len
))
1768 ip
->i_size
= (lp
->l_start
+ lp
->l_len
);
1770 /* Release locks, end log transaction and unlock fs */
1771 TRANS_INODE(ufsvfsp
, ip
);
1772 rw_exit(&ip
->i_contents
);
1773 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1775 TRANS_END_CSYNC(ufsvfsp
, &err
, issync
, TOP_ALLOCSP
, resv
);
1776 err
= allocsp_unlockfs(vp
, &lf
);
1779 * @ exit label, we should no longer be holding the fs write lock, and
1780 * all logging transactions should have been ended. We still hold
1785 * File has grown larger than 2GB. Set flag
1786 * in superblock to indicate this, if it
1787 * is not already set.
1789 if ((ip
->i_size
> MAXOFF32_T
) &&
1790 !(fs
->fs_flags
& FSLARGEFILES
)) {
1791 ASSERT(ufsvfsp
->vfs_lfflags
& UFS_LARGEFILES
);
1792 mutex_enter(&ufsvfsp
->vfs_lock
);
1793 fs
->fs_flags
|= FSLARGEFILES
;
1794 ufs_sbwrite(ufsvfsp
);
1795 mutex_exit(&ufsvfsp
->vfs_lock
);
1799 * Since we couldn't allocate completely, we will undo the allocations.
1802 ufs_trans_trunc_resv(ip
, totblks
* fs
->fs_bsize
, &resv
, &resid
);
1803 TRANS_BEGIN_CSYNC(ufsvfsp
, &issync
, TOP_ALLOCSP
, resv
);
1805 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1806 rw_enter(&ip
->i_contents
, RW_WRITER
);
1809 for (i
= 0; i
< NDADDR
; i
++) {
1811 * Only free the block if they are not same, and
1812 * the old one isn't zero (the fragment was
1815 if (db_undo
[i
] != ip
->i_db
[i
] && db_undo
[i
] == 0) {
1816 free(ip
, ip
->i_db
[i
], fs
->fs_bsize
, 0);
1821 /* Undo the indirect blocks */
1822 while (ib_undo
!= NULL
) {
1824 err
= bmap_set_bn(vp
, undo
->offset
, 0);
1826 cmn_err(CE_PANIC
, "ufs_allocsp(): failed to "
1827 "undo allocation of block %ld",
1829 free(ip
, undo
->blk
, fs
->fs_bsize
, I_IBLK
);
1830 ib_undo
= undo
->next
;
1831 kmem_free(undo
, sizeof (struct allocsp_undo
));
1835 TRANS_INODE(ufsvfsp
, ip
);
1837 rw_exit(&ip
->i_contents
);
1838 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1840 TRANS_END_CSYNC(ufsvfsp
, &err
, issync
, TOP_ALLOCSP
, resv
);
1842 rw_exit(&ip
->i_rwlock
);
1847 * Don't forget to free the undo chain :)
1849 while (ib_undo
!= NULL
) {
1851 ib_undo
= undo
->next
;
1852 kmem_free(undo
, sizeof (struct allocsp_undo
));
1855 rw_exit(&ip
->i_rwlock
);
1862 * Free storage space associated with the specified inode. The portion
1863 * to be freed is specified by lp->l_start and lp->l_len (already
1864 * normalized to a "whence" of 0).
1866 * This is an experimental facility whose continued existence is not
1867 * guaranteed. Currently, we only support the special case
1868 * of l_len == 0, meaning free to end of file.
1870 * Blocks are freed in reverse order. This FILO algorithm will tend to
1871 * maintain a contiguous free list much longer than FIFO.
1872 * See also ufs_itrunc() in ufs_inode.c.
1874 * Bug: unused bytes in the last retained block are not cleared.
1875 * This may result in a "hole" in the file that does not read as zeroes.
1879 ufs_freesp(struct vnode
*vp
, struct flock64
*lp
, int flag
, cred_t
*cr
)
1882 struct inode
*ip
= VTOI(vp
);
1885 ASSERT(vp
->v_type
== VREG
);
1886 ASSERT(lp
->l_start
>= 0); /* checked by convoff */
1891 rw_enter(&ip
->i_contents
, RW_READER
);
1892 if (ip
->i_size
== (uoff_t
)lp
->l_start
) {
1893 rw_exit(&ip
->i_contents
);
1898 * Check if there is any active mandatory lock on the
1899 * range that will be truncated/expanded.
1901 if (MANDLOCK(vp
, ip
->i_mode
)) {
1902 offset_t save_start
;
1904 save_start
= lp
->l_start
;
1906 if (ip
->i_size
< lp
->l_start
) {
1908 * "Truncate up" case: need to make sure there
1909 * is no lock beyond current end-of-file. To
1910 * do so, we need to set l_start to the size
1911 * of the file temporarily.
1913 lp
->l_start
= ip
->i_size
;
1915 lp
->l_type
= F_WRLCK
;
1917 lp
->l_pid
= ttoproc(curthread
)->p_pid
;
1918 i
= (flag
& (FNDELAY
|FNONBLOCK
)) ? 0 : SLPFLCK
;
1919 rw_exit(&ip
->i_contents
);
1920 if ((i
= reclock(vp
, lp
, i
, 0, lp
->l_start
, NULL
)) != 0 ||
1921 lp
->l_type
!= F_UNLCK
) {
1922 return (i
? i
: EAGAIN
);
1924 rw_enter(&ip
->i_contents
, RW_READER
);
1926 lp
->l_start
= save_start
;
1930 * Make sure a write isn't in progress (allocating blocks)
1931 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1932 * truncate while it was allocating blocks).
1933 * Grab the locks in the right order.
1935 rw_exit(&ip
->i_contents
);
1936 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
1937 error
= TRANS_ITRUNC(ip
, (uoff_t
)lp
->l_start
, 0, cr
);
1938 rw_exit(&ip
->i_rwlock
);
1943 * Find a cg with as close to nb contiguous bytes as possible
1944 * THIS MAY TAKE MANY DISK READS!
1946 * Implemented in an attempt to allocate contiguous blocks for
1947 * writing the ufs log file to, minimizing future disk head seeking
1950 contigpref(ufsvfs_t
*ufsvfsp
, size_t nb
, size_t minb
)
1952 struct fs
*fs
= ufsvfsp
->vfs_fs
;
1953 daddr_t nblk
= lblkno(fs
, blkroundup(fs
, nb
));
1954 daddr_t minblk
= lblkno(fs
, blkroundup(fs
, minb
));
1955 daddr_t savebno
, curbno
, cgbno
;
1956 int cg
, cgblks
, savecg
, savenblk
, curnblk
, startcg
;
1965 if ((startcg
= findlogstartcg(fs
, nblk
, minblk
)) == -1)
1966 cg
= 0; /* Nothing suitable found */
1970 for (; cg
< fs
->fs_ncg
; ++cg
) {
1972 * find the largest contiguous range in this cg
1974 bp
= UFS_BREAD(ufsvfsp
, ufsvfsp
->vfs_dev
,
1975 (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
1976 (int)fs
->fs_cgsize
);
1977 cgp
= bp
->b_un
.b_cg
;
1978 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
)) {
1982 blksfree
= cg_blksfree(cgp
); /* free array */
1983 cgblks
= fragstoblks(fs
, fs
->fs_fpg
); /* blks in free array */
1985 while (cgbno
< cgblks
&& savenblk
< nblk
) {
1986 /* find a free block */
1987 for (; cgbno
< cgblks
; ++cgbno
) {
1988 if (isblock(fs
, blksfree
, cgbno
)) {
1989 if (startcg
!= -1) {
1999 /* count the number of free blocks */
2000 for (curnblk
= 0; cgbno
< cgblks
; ++cgbno
) {
2001 if (!isblock(fs
, blksfree
, cgbno
))
2003 if (++curnblk
>= nblk
)
2006 if (curnblk
> savenblk
) {
2013 if (savenblk
>= nblk
)
2019 /* convert block offset in cg to frag offset in cg */
2020 savebno
= blkstofrags(fs
, savebno
);
2022 /* convert frag offset in cg to frag offset in fs */
2023 savebno
+= (savecg
* fs
->fs_fpg
);
2029 * The object of this routine is to find a start point for the UFS log.
2030 * Ideally the space should be allocated from the smallest possible number
2031 * of contiguous cylinder groups. This is found by using a sliding window
2032 * technique. The smallest window of contiguous cylinder groups, which is
2033 * still able to accommodate the target, is found by moving the window
2034 * through the cylinder groups in a single pass. The end of the window is
2035 * advanced until the space is accommodated, then the start is advanced until
2036 * it no longer fits, the end is then advanced again and so on until the
2037 * final cylinder group is reached. The first suitable instance is recorded
2038 * and its starting cg number is returned.
2040 * If we are not able to find a minimum amount of space, represented by
2041 * minblk, or to do so uses more than the available extents, then return -1.
2045 findlogstartcg(struct fs
*fs
, daddr_t requested
, daddr_t minblk
)
2047 int ncgs
; /* number of cylinder groups */
2048 daddr_t target
; /* amount of space sought */
2049 int cwidth
, ctotal
; /* current window width and total */
2050 int bwidth
, btotal
; /* best window width and total so far */
2051 int s
; /* index of the first element in the current window */
2052 int e
; /* index of the first element + the width */
2053 /* (i.e. 1 + index of last element) */
2054 int bs
; /* index of the first element in the best window so far */
2055 int header
, max_extents
;
2060 header
= sizeof (extent_block_t
) - sizeof (extent_t
);
2061 max_extents
= ((fs
->fs_bsize
)-header
) / sizeof (extent_t
);
2062 cwidth
= ctotal
= 0;
2067 /* Advance the end of the window until it accommodates the target. */
2068 while (ctotal
< target
&& e
< ncgs
) {
2069 ctotal
+= fs
->fs_cs(fs
, e
).cs_nbfree
;
2074 * Advance the start of the window until it no longer
2075 * accommodates the target.
2077 while (ctotal
>= target
&& s
< e
) {
2078 /* See if this is the smallest window so far. */
2080 if (cwidth
<= bwidth
) {
2081 if (cwidth
== bwidth
&& ctotal
<= btotal
)
2088 ctotal
-= fs
->fs_cs(fs
, s
).cs_nbfree
;
2094 * If we cannot allocate the minimum required or we use too many
2095 * extents to do so, return -1.
2097 if (btotal
< minblk
|| bwidth
> max_extents
)