4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
39 #include <sys/condvar_impl.h>
40 #include <sys/types.h>
41 #include <sys/t_lock.h>
42 #include <sys/debug.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/signal.h>
52 #include <sys/vnode.h>
54 #include <sys/fs/ufs_fs.h>
55 #include <sys/fs/ufs_inode.h>
56 #include <sys/fs/ufs_acl.h>
57 #include <sys/fs/ufs_bio.h>
58 #include <sys/fs/ufs_quota.h>
60 #include <sys/fs/ufs_trans.h>
61 #include <sys/fs/ufs_panic.h>
62 #include <sys/errno.h>
64 #include <sys/sysmacros.h>
66 #include <sys/fcntl.h>
67 #include <sys/flock.h>
68 #include <fs/fs_subr.h>
69 #include <sys/cmn_err.h>
70 #include <sys/policy.h>
71 #include <sys/fs/ufs_log.h>
73 static ino_t
hashalloc();
74 static daddr_t
fragextend();
75 static daddr_t
alloccg();
76 static daddr_t
alloccgblk();
77 static ino_t
ialloccg();
78 static daddr_t
mapsearch();
79 static int findlogstartcg();
81 extern int inside
[], around
[];
82 extern uchar_t
*fragtbl
[];
86 * Allocate a block in the file system.
88 * The size of the requested block is given, which must be some
89 * multiple of fs_fsize and <= fs_bsize.
90 * A preference may be optionally specified. If a preference is given
91 * the following hierarchy is used to allocate a block:
92 * 1) allocate the requested block.
93 * 2) allocate a rotationally optimal block in the same cylinder.
94 * 3) allocate a block in the same cylinder group.
95 * 4) quadratically rehash into other cylinder groups, until an
96 * available block is located.
97 * If no block preference is given the following hierarchy is used
98 * to allocate a block:
99 * 1) allocate a block in the cylinder group that contains the
100 * inode for the file.
101 * 2) quadratically rehash into other cylinder groups, until an
102 * available block is located.
105 alloc(struct inode
*ip
, daddr_t bpref
, int size
, daddr_t
*bnp
, cred_t
*cr
)
108 struct ufsvfs
*ufsvfsp
;
116 ufsvfsp
= ip
->i_ufsvfs
;
117 fs
= ufsvfsp
->vfs_fs
;
118 if ((unsigned)size
> fs
->fs_bsize
|| fragoff(fs
, size
) != 0) {
119 err
= ufs_fault(ITOV(ip
), "alloc: bad size, dev = 0x%lx,"
120 " bsize = %d, size = %d, fs = %s\n",
121 ip
->i_dev
, fs
->fs_bsize
, size
, fs
->fs_fsmnt
);
124 if (size
== fs
->fs_bsize
&& fs
->fs_cstotal
.cs_nbfree
== 0)
126 if (freespace(fs
, ufsvfsp
) <= 0 &&
127 secpolicy_fs_minfree(cr
, ufsvfsp
->vfs_vfs
) != 0)
129 err
= chkdq(ip
, (long)btodb(size
), 0, cr
, &errmsg
, &len
);
130 /* Note that may not have err, but may have errmsg */
131 if (errmsg
!= NULL
) {
133 kmem_free(errmsg
, len
);
138 if (bpref
>= fs
->fs_size
)
141 cg
= (int)itog(fs
, ip
->i_number
);
143 cg
= dtog(fs
, bpref
);
145 bno
= (daddr_t
)hashalloc(ip
, cg
, (long)bpref
, size
,
146 (ulong_t (*)())alloccg
);
153 * hashalloc() failed because some other thread grabbed
154 * the last block so unwind the quota operation. We can
155 * ignore the return because subtractions don't fail and
156 * size is guaranteed to be >= zero by our caller.
158 (void) chkdq(ip
, -(long)btodb(size
), 0, cr
, (char **)NULL
,
162 now
= ddi_get_lbolt();
163 mutex_enter(&ufsvfsp
->vfs_lock
);
164 if ((now
- ufsvfsp
->vfs_lastwhinetime
) > (hz
<< 2) &&
165 (!(TRANS_ISTRANS(ufsvfsp
)) || !(ip
->i_flag
& IQUIET
))) {
166 ufsvfsp
->vfs_lastwhinetime
= now
;
167 cmn_err(CE_NOTE
, "alloc: %s: file system full", fs
->fs_fsmnt
);
169 mutex_exit(&ufsvfsp
->vfs_lock
);
174 * Reallocate a fragment to a bigger size
176 * The number and size of the old block is given, and a preference
177 * and new size is also specified. The allocator attempts to extend
178 * the original block. Failing that, the regular block allocator is
179 * invoked to get an appropriate block.
182 realloccg(struct inode
*ip
, daddr_t bprev
, daddr_t bpref
, int osize
,
183 int nsize
, daddr_t
*bnp
, cred_t
*cr
)
187 struct ufsvfs
*ufsvfsp
;
194 ufsvfsp
= ip
->i_ufsvfs
;
195 fs
= ufsvfsp
->vfs_fs
;
196 if ((unsigned)osize
> fs
->fs_bsize
|| fragoff(fs
, osize
) != 0 ||
197 (unsigned)nsize
> fs
->fs_bsize
|| fragoff(fs
, nsize
) != 0) {
198 err
= ufs_fault(ITOV(ip
),
199 "realloccg: bad size, dev=0x%lx, bsize=%d, "
200 "osize=%d, nsize=%d, fs=%s\n",
201 ip
->i_dev
, fs
->fs_bsize
, osize
, nsize
, fs
->fs_fsmnt
);
204 if (freespace(fs
, ufsvfsp
) <= 0 &&
205 secpolicy_fs_minfree(cr
, ufsvfsp
->vfs_vfs
) != 0)
208 err
= ufs_fault(ITOV(ip
),
209 "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
210 " bprev = %ld, fs = %s\n", ip
->i_dev
, fs
->fs_bsize
, bprev
,
214 err
= chkdq(ip
, (long)btodb(nsize
- osize
), 0, cr
, &errmsg
, &len
);
215 /* Note that may not have err, but may have errmsg */
216 if (errmsg
!= NULL
) {
218 kmem_free(errmsg
, len
);
223 cg
= dtog(fs
, bprev
);
224 bno
= fragextend(ip
, cg
, (long)bprev
, osize
, nsize
);
229 if (bpref
>= fs
->fs_size
)
233 * When optimizing for time we allocate a full block and
234 * then only use the upper portion for this request. When
235 * this file grows again it will grow into the unused portion
236 * of the block (See fragextend() above). This saves time
237 * because an extra disk write would be needed if the frags
238 * following the current allocation were not free. The extra
239 * disk write is needed to move the data from its current
240 * location into the newly allocated position.
242 * When optimizing for space we allocate a run of frags
243 * that is just the right size for this request.
245 request
= (fs
->fs_optim
== FS_OPTTIME
) ? fs
->fs_bsize
: nsize
;
246 bno
= (daddr_t
)hashalloc(ip
, cg
, (long)bpref
, request
,
247 (ulong_t (*)())alloccg
);
251 (void) free(ip
, bno
+ numfrags(fs
, nsize
),
252 (off_t
)(request
- nsize
), I_NOCANCEL
);
257 * hashalloc() failed because some other thread grabbed
258 * the last block so unwind the quota operation. We can
259 * ignore the return because subtractions don't fail, and
260 * our caller guarantees nsize >= osize.
262 (void) chkdq(ip
, -(long)btodb(nsize
- osize
), 0, cr
, (char **)NULL
,
266 now
= ddi_get_lbolt();
267 mutex_enter(&ufsvfsp
->vfs_lock
);
268 if ((now
- ufsvfsp
->vfs_lastwhinetime
) > (hz
<< 2) &&
269 (!(TRANS_ISTRANS(ufsvfsp
)) || !(ip
->i_flag
& IQUIET
))) {
270 ufsvfsp
->vfs_lastwhinetime
= now
;
272 "realloccg %s: file system full", fs
->fs_fsmnt
);
274 mutex_exit(&ufsvfsp
->vfs_lock
);
279 * Allocate an inode in the file system.
281 * A preference may be optionally specified. If a preference is given
282 * the following hierarchy is used to allocate an inode:
283 * 1) allocate the requested inode.
284 * 2) allocate an inode in the same cylinder group.
285 * 3) quadratically rehash into other cylinder groups, until an
286 * available inode is located.
287 * If no inode preference is given the following hierarchy is used
288 * to allocate an inode:
289 * 1) allocate an inode in cylinder group 0.
290 * 2) quadratically rehash into other cylinder groups, until an
291 * available inode is located.
294 ufs_ialloc(struct inode
*pip
,
295 ino_t ipref
, mode_t mode
, struct inode
**ipp
, cred_t
*cr
)
303 struct ufsvfs
*ufsvfsp
= pip
->i_ufsvfs
;
307 ASSERT(RW_WRITE_HELD(&pip
->i_rwlock
));
310 nifree
= fs
->fs_cstotal
.cs_nifree
;
315 * Shadow inodes don't count against a user's inode allocation.
316 * They are an implementation method and not a resource.
318 if ((mode
!= IFSHAD
) && (mode
!= IFATTRDIR
)) {
319 err
= chkiq((struct ufsvfs
*)ITOV(pip
)->v_vfsp
->vfs_data
,
320 /* change */ 1, (struct inode
*)NULL
, crgetuid(cr
), 0,
323 * As we haven't acquired any locks yet, dump the message
326 if (errmsg
!= NULL
) {
328 kmem_free(errmsg
, len
);
335 if (ipref
>= (ulong_t
)(fs
->fs_ncg
* fs
->fs_ipg
))
337 cg
= (int)itog(fs
, ipref
);
338 ino
= (ino_t
)hashalloc(pip
, cg
, (long)ipref
, (int)mode
,
339 (ulong_t (*)())ialloccg
);
341 if ((mode
!= IFSHAD
) && (mode
!= IFATTRDIR
)) {
343 * We can safely ignore the return from chkiq()
344 * because deallocations can only fail if we
345 * can't get the user's quota info record off
346 * the disk due to an I/O error. In that case,
347 * the quota subsystem is already messed up.
349 (void) chkiq(ufsvfsp
, /* change */ -1,
350 (struct inode
*)NULL
, crgetuid(cr
), 0, cr
,
351 (char **)NULL
, (size_t *)NULL
);
355 err
= ufs_iget(pip
->i_vfs
, ino
, ipp
, cr
);
357 if ((mode
!= IFSHAD
) && (mode
!= IFATTRDIR
)) {
359 * See above comment about why it is safe to ignore an
362 (void) chkiq(ufsvfsp
, /* change */ -1,
363 (struct inode
*)NULL
, crgetuid(cr
), 0, cr
,
364 (char **)NULL
, (size_t *)NULL
);
366 ufs_ifree(pip
, ino
, 0);
370 ASSERT(!ip
->i_ufs_acl
);
371 ASSERT(!ip
->i_dquot
);
372 rw_enter(&ip
->i_contents
, RW_WRITER
);
375 * Check if we really got a free inode, if not then complain
376 * and mark the inode ISTALE so that it will be freed by the
377 * ufs idle thread eventually and will not be sent to ufs_delete().
379 if (ip
->i_mode
|| (ip
->i_nlink
> 0)) {
380 ip
->i_flag
|= ISTALE
;
381 rw_exit(&ip
->i_contents
);
384 "%s: unexpected allocated inode %d, run fsck(1M)%s",
385 fs
->fs_fsmnt
, (int)ino
,
386 (TRANS_ISTRANS(ufsvfsp
) ? " -o f" : ""));
391 * Check the inode has no size or data blocks.
392 * This could have happened if the truncation failed when
393 * deleting the inode. It used to be possible for this to occur
394 * if a block allocation failed when iteratively truncating a
395 * large file using logging and with a full file system.
396 * This was fixed with bug fix 4348738. However, truncation may
397 * still fail on an IO error. So in all cases for safety and
398 * security we clear out the size; the blocks allocated; and
399 * pointers to the blocks. This will ultimately cause a fsck
400 * error of un-accounted for blocks, but its a fairly benign error,
401 * and possibly the correct thing to do anyway as accesssing those
402 * blocks agains may lead to more IO errors.
404 if (ip
->i_size
|| ip
->i_blocks
) {
409 "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
410 fs
->fs_fsmnt
, (int)ino
, ip
->i_size
,
411 (TRANS_ISTRANS(ufsvfsp
) ? " -o f" : ""));
414 * Clear any garbage left behind.
416 ip
->i_size
= (u_offset_t
)0;
418 for (i
= 0; i
< NDADDR
; i
++)
420 for (i
= 0; i
< NIADDR
; i
++)
425 * Initialize the link count
430 * Clear the old flags
435 * Access times are not really defined if the fs is mounted
436 * with 'noatime'. But it can cause nfs clients to fail
437 * open() if the atime is not a legal value. Set a legal value
438 * here when the inode is allocated.
440 if (ufsvfsp
->vfs_noatime
) {
441 mutex_enter(&ufs_iuniqtime_lock
);
442 ip
->i_atime
= iuniqtime
;
443 mutex_exit(&ufs_iuniqtime_lock
);
445 rw_exit(&ip
->i_contents
);
448 if (!(TRANS_ISTRANS(ufsvfsp
)) || !(pip
->i_flag
& IQUIET
))
449 cmn_err(CE_NOTE
, "%s: out of inodes\n", fs
->fs_fsmnt
);
454 * Find a cylinder group to place a directory.
455 * Returns an inumber within the selected cylinder group.
456 * Note, the vfs_lock is not needed as we don't require exact cg summary info.
458 * If the switch ufs_close_dirs is set, then the policy is to use
459 * the current cg if it has more than 25% free inodes and more
460 * than 25% free blocks. Otherwise the cgs are searched from
461 * the beginning and the first cg with the same criteria is
462 * used. If that is also null then we revert to the old algorithm.
463 * This tends to cluster files at the beginning of the disk
464 * until the disk gets full.
466 * Otherwise if ufs_close_dirs is not set then the original policy is
467 * used which is to select from among those cylinder groups with
468 * above the average number of free inodes, the one with the smallest
469 * number of directories.
472 int ufs_close_dirs
= 1; /* allocate directories close as possible */
477 int cg
, minndir
, mincg
, avgifree
, mininode
, minbpg
, ifree
;
478 struct fs
*fs
= dp
->i_fs
;
480 cg
= itog(fs
, dp
->i_number
);
481 mininode
= fs
->fs_ipg
>> 2;
482 minbpg
= fs
->fs_maxbpg
>> 2;
483 if (ufs_close_dirs
&&
484 (fs
->fs_cs(fs
, cg
).cs_nifree
> mininode
) &&
485 (fs
->fs_cs(fs
, cg
).cs_nbfree
> minbpg
)) {
486 return (dp
->i_number
);
489 avgifree
= fs
->fs_cstotal
.cs_nifree
/ fs
->fs_ncg
;
490 minndir
= fs
->fs_ipg
;
492 for (cg
= 0; cg
< fs
->fs_ncg
; cg
++) {
493 ifree
= fs
->fs_cs(fs
, cg
).cs_nifree
;
494 if (ufs_close_dirs
&&
495 (ifree
> mininode
) &&
496 (fs
->fs_cs(fs
, cg
).cs_nbfree
> minbpg
)) {
497 return ((ino_t
)(fs
->fs_ipg
* cg
));
499 if ((fs
->fs_cs(fs
, cg
).cs_ndir
< minndir
) &&
500 (ifree
>= avgifree
)) {
502 minndir
= fs
->fs_cs(fs
, cg
).cs_ndir
;
505 return ((ino_t
)(fs
->fs_ipg
* mincg
));
509 * Select the desired position for the next block in a file. The file is
510 * logically divided into sections. The first section is composed of the
511 * direct blocks. Each additional section contains fs_maxbpg blocks.
513 * If no blocks have been allocated in the first section, the policy is to
514 * request a block in the same cylinder group as the inode that describes
515 * the file. If no blocks have been allocated in any other section, the
516 * policy is to place the section in a cylinder group with a greater than
517 * average number of free blocks. An appropriate cylinder group is found
518 * by using a rotor that sweeps the cylinder groups. When a new group of
519 * blocks is needed, the sweep begins in the cylinder group following the
520 * cylinder group from which the previous allocation was made. The sweep
521 * continues until a cylinder group with greater than the average number
522 * of free blocks is found. If the allocation is for the first block in an
523 * indirect block, the information on the previous allocation is unavailable;
524 * here a best guess is made based upon the logical block number being
527 * If a section is already partially allocated, the policy is to
528 * contiguously allocate fs_maxcontig blocks. The end of one of these
529 * contiguous blocks and the beginning of the next is physically separated
530 * so that the disk head will be in transit between them for at least
531 * fs_rotdelay milliseconds. This is to allow time for the processor to
532 * schedule another I/O transfer.
535 blkpref(struct inode
*ip
, daddr_t lbn
, int indx
, daddr32_t
*bap
)
538 struct ufsvfs
*ufsvfsp
;
540 int avgbfree
, startcg
;
543 ufsvfsp
= ip
->i_ufsvfs
;
545 if (indx
% fs
->fs_maxbpg
== 0 || bap
[indx
- 1] == 0) {
547 cg
= itog(fs
, ip
->i_number
);
548 return (fs
->fs_fpg
* cg
+ fs
->fs_frag
);
551 * Find a cylinder with greater than average
552 * number of unused data blocks.
554 if (indx
== 0 || bap
[indx
- 1] == 0)
555 startcg
= itog(fs
, ip
->i_number
) + lbn
/ fs
->fs_maxbpg
;
557 startcg
= dtog(fs
, bap
[indx
- 1]) + 1;
558 startcg
%= fs
->fs_ncg
;
560 mutex_enter(&ufsvfsp
->vfs_lock
);
561 avgbfree
= fs
->fs_cstotal
.cs_nbfree
/ fs
->fs_ncg
;
563 * used for computing log space for writes/truncs
565 ufsvfsp
->vfs_avgbfree
= avgbfree
;
566 for (cg
= startcg
; cg
< fs
->fs_ncg
; cg
++)
567 if (fs
->fs_cs(fs
, cg
).cs_nbfree
>= avgbfree
) {
569 mutex_exit(&ufsvfsp
->vfs_lock
);
570 return (fs
->fs_fpg
* cg
+ fs
->fs_frag
);
572 for (cg
= 0; cg
<= startcg
; cg
++)
573 if (fs
->fs_cs(fs
, cg
).cs_nbfree
>= avgbfree
) {
575 mutex_exit(&ufsvfsp
->vfs_lock
);
576 return (fs
->fs_fpg
* cg
+ fs
->fs_frag
);
578 mutex_exit(&ufsvfsp
->vfs_lock
);
582 * One or more previous blocks have been laid out. If less
583 * than fs_maxcontig previous blocks are contiguous, the
584 * next block is requested contiguously, otherwise it is
585 * requested rotationally delayed by fs_rotdelay milliseconds.
588 nextblk
= bap
[indx
- 1];
590 * Provision for fallocate to return positive
591 * blk preference based on last allocation
593 if (nextblk
< 0 && nextblk
!= UFS_HOLE
) {
594 nextblk
= (-bap
[indx
- 1]) + fs
->fs_frag
;
596 nextblk
= bap
[indx
- 1] + fs
->fs_frag
;
599 if (indx
> fs
->fs_maxcontig
&& bap
[indx
- fs
->fs_maxcontig
] +
600 blkstofrags(fs
, fs
->fs_maxcontig
) != nextblk
) {
603 if (fs
->fs_rotdelay
!= 0)
605 * Here we convert ms of delay to frags as:
606 * (frags) = (ms) * (rev/sec) * (sect/rev) /
607 * ((sect/frag) * (ms/sec))
608 * then round up to the next block.
610 nextblk
+= roundup(fs
->fs_rotdelay
* fs
->fs_rps
* fs
->fs_nsect
/
611 (NSPF(fs
) * 1000), fs
->fs_frag
);
616 * Free a block or fragment.
618 * The specified block or fragment is placed back in the
619 * free map. If a fragment is deallocated, a possible
620 * block reassembly is checked.
623 free(struct inode
*ip
, daddr_t bno
, off_t size
, int flags
)
625 struct fs
*fs
= ip
->i_fs
;
626 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
627 struct ufs_q
*delq
= &ufsvfsp
->vfs_delete
;
628 struct ufs_delq_info
*delq_info
= &ufsvfsp
->vfs_delete_info
;
636 daddr_t blkno
, cylno
, rpos
;
639 * fallocate'd files will have negative block address.
640 * So negate it again to get original block address.
642 if (bno
< 0 && (bno
% fs
->fs_frag
== 0) && bno
!= UFS_HOLE
) {
646 if ((unsigned long)size
> fs
->fs_bsize
|| fragoff(fs
, size
) != 0) {
647 (void) ufs_fault(ITOV(ip
),
648 "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
649 "fs = %s\n", ip
->i_dev
, fs
->fs_bsize
,
650 (int)size
, fs
->fs_fsmnt
);
654 ASSERT(!ufs_badblock(ip
, bno
));
655 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
659 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
)) {
664 if (!(flags
& I_NOCANCEL
))
665 TRANS_CANCEL(ufsvfsp
, ldbtob(fsbtodb(fs
, bno
)), size
, flags
);
666 if (flags
& (I_DIR
|I_IBLK
|I_SHAD
|I_QUOTA
)) {
667 TRANS_MATA_FREE(ufsvfsp
, ldbtob(fsbtodb(fs
, bno
)), size
);
669 blksfree
= cg_blksfree(cgp
);
670 blktot
= cg_blktot(cgp
);
671 mutex_enter(&ufsvfsp
->vfs_lock
);
672 cgp
->cg_time
= gethrestime_sec();
673 bno
= dtogd(fs
, bno
);
674 if (size
== fs
->fs_bsize
) {
675 blkno
= fragstoblks(fs
, bno
);
676 cylno
= cbtocylno(fs
, bno
);
677 rpos
= cbtorpos(ufsvfsp
, bno
);
678 blks
= cg_blks(ufsvfsp
, cgp
, cylno
);
679 if (!isclrblock(fs
, blksfree
, blkno
)) {
680 mutex_exit(&ufsvfsp
->vfs_lock
);
682 (void) ufs_fault(ITOV(ip
), "free: freeing free block, "
683 "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
684 ip
->i_dev
, bno
, ip
->i_number
, fs
->fs_fsmnt
);
687 setblock(fs
, blksfree
, blkno
);
690 cgp
->cg_cs
.cs_nbfree
++; /* Log below */
691 fs
->fs_cstotal
.cs_nbfree
++;
692 fs
->fs_cs(fs
, cg
).cs_nbfree
++;
693 if (TRANS_ISTRANS(ufsvfsp
) && (flags
& I_ACCT
)) {
694 mutex_enter(&delq
->uq_mutex
);
695 delq_info
->delq_unreclaimed_blocks
-=
697 mutex_exit(&delq
->uq_mutex
);
700 bbase
= bno
- fragnum(fs
, bno
);
702 * Decrement the counts associated with the old frags
704 bmap
= blkmap(fs
, blksfree
, bbase
);
705 fragacct(fs
, bmap
, cgp
->cg_frsum
, -1);
707 * Deallocate the fragment
709 for (i
= 0; i
< numfrags(fs
, size
); i
++) {
710 if (isset(blksfree
, bno
+ i
)) {
712 mutex_exit(&ufsvfsp
->vfs_lock
);
713 (void) ufs_fault(ITOV(ip
),
714 "free: freeing free frag, "
715 "dev:0x%lx, blk:%ld, cg:%d, "
724 setbit(blksfree
, bno
+ i
);
726 cgp
->cg_cs
.cs_nffree
+= i
;
727 fs
->fs_cstotal
.cs_nffree
+= i
;
728 fs
->fs_cs(fs
, cg
).cs_nffree
+= i
;
729 if (TRANS_ISTRANS(ufsvfsp
) && (flags
& I_ACCT
)) {
730 mutex_enter(&delq
->uq_mutex
);
731 delq_info
->delq_unreclaimed_blocks
-=
732 btodb(i
* fs
->fs_fsize
);
733 mutex_exit(&delq
->uq_mutex
);
736 * Add back in counts associated with the new frags
738 bmap
= blkmap(fs
, blksfree
, bbase
);
739 fragacct(fs
, bmap
, cgp
->cg_frsum
, 1);
741 * If a complete block has been reassembled, account for it
743 blkno
= fragstoblks(fs
, bbase
);
744 if (isblock(fs
, blksfree
, blkno
)) {
745 cylno
= cbtocylno(fs
, bbase
);
746 rpos
= cbtorpos(ufsvfsp
, bbase
);
747 blks
= cg_blks(ufsvfsp
, cgp
, cylno
);
750 cgp
->cg_cs
.cs_nffree
-= fs
->fs_frag
;
751 fs
->fs_cstotal
.cs_nffree
-= fs
->fs_frag
;
752 fs
->fs_cs(fs
, cg
).cs_nffree
-= fs
->fs_frag
;
753 cgp
->cg_cs
.cs_nbfree
++;
754 fs
->fs_cstotal
.cs_nbfree
++;
755 fs
->fs_cs(fs
, cg
).cs_nbfree
++;
759 ufs_notclean(ufsvfsp
);
760 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
761 TRANS_SI(ufsvfsp
, fs
, cg
);
768 * The specified inode is placed back in the free map.
771 ufs_ifree(struct inode
*ip
, ino_t ino
, mode_t mode
)
773 struct fs
*fs
= ip
->i_fs
;
774 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
781 if (ip
->i_number
== ino
&& ip
->i_mode
!= 0) {
782 (void) ufs_fault(ITOV(ip
),
783 "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
785 ip
->i_mode
, mode
, (int)ip
->i_number
, fs
->fs_fsmnt
);
788 if (ino
>= fs
->fs_ipg
* fs
->fs_ncg
) {
789 (void) ufs_fault(ITOV(ip
),
790 "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
791 (int)ip
->i_dev
, (int)ino
, fs
->fs_fsmnt
);
794 cg
= (int)itog(fs
, ino
);
795 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
799 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
)) {
803 mutex_enter(&ufsvfsp
->vfs_lock
);
804 cgp
->cg_time
= gethrestime_sec();
805 iused
= cg_inosused(cgp
);
806 inot
= (unsigned int)(ino
% (ulong_t
)fs
->fs_ipg
);
807 if (isclr(iused
, inot
)) {
808 mutex_exit(&ufsvfsp
->vfs_lock
);
810 (void) ufs_fault(ITOV(ip
), "ufs_ifree: freeing free inode, "
811 "mode: (imode) %o, (omode) %o, ino:%d, "
813 ip
->i_mode
, mode
, (int)ino
, fs
->fs_fsmnt
);
818 if (inot
< (ulong_t
)cgp
->cg_irotor
)
819 cgp
->cg_irotor
= inot
;
820 cgp
->cg_cs
.cs_nifree
++;
821 fs
->fs_cstotal
.cs_nifree
++;
822 fs
->fs_cs(fs
, cg
).cs_nifree
++;
823 if (((mode
& IFMT
) == IFDIR
) || ((mode
& IFMT
) == IFATTRDIR
)) {
824 cgp
->cg_cs
.cs_ndir
--;
825 fs
->fs_cstotal
.cs_ndir
--;
826 fs
->fs_cs(fs
, cg
).cs_ndir
--;
829 ufs_notclean(ufsvfsp
);
830 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
831 TRANS_SI(ufsvfsp
, fs
, cg
);
836 * Implement the cylinder overflow algorithm.
838 * The policy implemented by this algorithm is:
839 * 1) allocate the block in its requested cylinder group.
840 * 2) quadratically rehash on the cylinder group number.
841 * 3) brute force search for a free block.
842 * The size parameter means size for data blocks, mode for inodes.
845 hashalloc(struct inode
*ip
, int cg
, long pref
, int size
, ulong_t (*allocator
)())
854 * 1: preferred cylinder group
856 result
= (*allocator
)(ip
, cg
, pref
, size
);
860 * 2: quadratic rehash
862 for (i
= 1; i
< fs
->fs_ncg
; i
*= 2) {
864 if (cg
>= fs
->fs_ncg
)
866 result
= (*allocator
)(ip
, cg
, 0, size
);
871 * 3: brute force search
872 * Note that we start at i == 2, since 0 was checked initially,
873 * and 1 is always checked in the quadratic rehash.
875 cg
= (icg
+ 2) % fs
->fs_ncg
;
876 for (i
= 2; i
< fs
->fs_ncg
; i
++) {
877 result
= (*allocator
)(ip
, cg
, 0, size
);
881 if (cg
== fs
->fs_ncg
)
888 * Determine whether a fragment can be extended.
890 * Check to see if the necessary fragments are available, and
891 * if they are, allocate them.
894 fragextend(struct inode
*ip
, int cg
, long bprev
, int osize
, int nsize
)
896 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
897 struct fs
*fs
= ip
->i_fs
;
905 if (fs
->fs_cs(fs
, cg
).cs_nffree
< numfrags(fs
, nsize
- osize
))
907 frags
= numfrags(fs
, nsize
);
908 bbase
= (int)fragnum(fs
, bprev
);
909 if (bbase
> fragnum(fs
, (bprev
+ frags
- 1))) {
910 /* cannot extend across a block boundary */
914 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
917 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
)) {
922 blksfree
= cg_blksfree(cgp
);
923 mutex_enter(&ufsvfsp
->vfs_lock
);
924 bno
= dtogd(fs
, bprev
);
925 for (i
= numfrags(fs
, osize
); i
< frags
; i
++) {
926 if (isclr(blksfree
, bno
+ i
)) {
927 mutex_exit(&ufsvfsp
->vfs_lock
);
931 if ((TRANS_ISCANCEL(ufsvfsp
, ldbtob(fsbtodb(fs
, bprev
+ i
)),
933 mutex_exit(&ufsvfsp
->vfs_lock
);
939 cgp
->cg_time
= gethrestime_sec();
941 * The current fragment can be extended,
942 * deduct the count on fragment being extended into
943 * increase the count on the remaining fragment (if any)
944 * allocate the extended piece.
946 for (i
= frags
; i
< fs
->fs_frag
- bbase
; i
++)
947 if (isclr(blksfree
, bno
+ i
))
949 j
= i
- numfrags(fs
, osize
);
951 ASSERT(cgp
->cg_frsum
[j
] >= 0);
953 cgp
->cg_frsum
[i
- frags
]++;
954 for (i
= numfrags(fs
, osize
); i
< frags
; i
++) {
955 clrbit(blksfree
, bno
+ i
);
956 cgp
->cg_cs
.cs_nffree
--;
957 fs
->fs_cs(fs
, cg
).cs_nffree
--;
958 fs
->fs_cstotal
.cs_nffree
--;
961 ufs_notclean(ufsvfsp
);
962 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
963 TRANS_SI(ufsvfsp
, fs
, cg
);
965 return ((daddr_t
)bprev
);
969 * Determine whether a block can be allocated.
971 * Check to see if a block of the apprpriate size
972 * is available, and if it is, allocate it.
975 alloccg(struct inode
*ip
, int cg
, daddr_t bpref
, int size
)
977 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
978 struct fs
*fs
= ip
->i_fs
;
987 * Searching for space could be time expensive so do some
988 * up front checking to verify that there is actually space
989 * available (free blocks or free frags).
991 if (fs
->fs_cs(fs
, cg
).cs_nbfree
== 0) {
992 if (size
== fs
->fs_bsize
)
996 * If there are not enough free frags then return.
998 if (fs
->fs_cs(fs
, cg
).cs_nffree
< numfrags(fs
, size
))
1002 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
1003 (int)fs
->fs_cgsize
);
1005 cgp
= bp
->b_un
.b_cg
;
1006 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
) ||
1007 (cgp
->cg_cs
.cs_nbfree
== 0 && size
== fs
->fs_bsize
)) {
1011 blksfree
= cg_blksfree(cgp
);
1012 mutex_enter(&ufsvfsp
->vfs_lock
);
1013 cgp
->cg_time
= gethrestime_sec();
1014 if (size
== fs
->fs_bsize
) {
1015 if ((bno
= alloccgblk(ufsvfsp
, cgp
, bpref
, bp
)) == 0)
1018 ufs_notclean(ufsvfsp
);
1019 TRANS_SI(ufsvfsp
, fs
, cg
);
1024 * Check fragment bitmap to see if any fragments are already available.
1025 * mapsearch() may fail because the fragment that fits this request
1026 * might still be on the cancel list and not available for re-use yet.
1027 * Look for a bigger sized fragment to allocate first before we have
1028 * to give up and fragment a whole new block eventually.
1030 frags
= numfrags(fs
, size
);
1033 for (; allocsiz
< fs
->fs_frag
; allocsiz
++)
1034 if (cgp
->cg_frsum
[allocsiz
] != 0)
1037 if (allocsiz
!= fs
->fs_frag
) {
1038 bno
= mapsearch(ufsvfsp
, cgp
, bpref
, allocsiz
);
1039 if (bno
< 0 && allocsiz
< (fs
->fs_frag
- 1)) {
1045 if (allocsiz
== fs
->fs_frag
|| bno
< 0) {
1047 * No fragments were available, so a block
1048 * will be allocated and hacked up.
1050 if (cgp
->cg_cs
.cs_nbfree
== 0)
1052 if ((bno
= alloccgblk(ufsvfsp
, cgp
, bpref
, bp
)) == 0)
1054 bpref
= dtogd(fs
, bno
);
1055 for (i
= frags
; i
< fs
->fs_frag
; i
++)
1056 setbit(blksfree
, bpref
+ i
);
1057 i
= fs
->fs_frag
- frags
;
1058 cgp
->cg_cs
.cs_nffree
+= i
;
1059 fs
->fs_cstotal
.cs_nffree
+= i
;
1060 fs
->fs_cs(fs
, cg
).cs_nffree
+= i
;
1063 ufs_notclean(ufsvfsp
);
1064 TRANS_SI(ufsvfsp
, fs
, cg
);
1069 for (i
= 0; i
< frags
; i
++)
1070 clrbit(blksfree
, bno
+ i
);
1071 cgp
->cg_cs
.cs_nffree
-= frags
;
1072 fs
->fs_cstotal
.cs_nffree
-= frags
;
1073 fs
->fs_cs(fs
, cg
).cs_nffree
-= frags
;
1074 cgp
->cg_frsum
[allocsiz
]--;
1075 ASSERT(cgp
->cg_frsum
[allocsiz
] >= 0);
1076 if (frags
!= allocsiz
) {
1077 cgp
->cg_frsum
[allocsiz
- frags
]++;
1080 ufs_notclean(ufsvfsp
);
1081 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
1082 TRANS_SI(ufsvfsp
, fs
, cg
);
1084 return (cg
* fs
->fs_fpg
+ bno
);
1086 mutex_exit(&ufsvfsp
->vfs_lock
);
1092 * Allocate a block in a cylinder group.
1094 * This algorithm implements the following policy:
1095 * 1) allocate the requested block.
1096 * 2) allocate a rotationally optimal block in the same cylinder.
1097 * 3) allocate the next available block on the block rotor for the
1098 * specified cylinder group.
1099 * Note that this routine only allocates fs_bsize blocks; these
1100 * blocks may be fragmented by the routine that allocates them.
1104 struct ufsvfs
*ufsvfsp
,
1110 int cylno
, pos
, delta
, rotbl_size
;
1115 daddr_t blkno
, rpos
, frag
;
1119 ASSERT(MUTEX_HELD(&ufsvfsp
->vfs_lock
));
1120 fs
= ufsvfsp
->vfs_fs
;
1121 blksfree
= cg_blksfree(cgp
);
1123 bpref
= cgp
->cg_rotor
;
1126 bpref
= blknum(fs
, bpref
);
1127 bpref
= dtogd(fs
, bpref
);
1129 * If the requested block is available, use it.
1131 if (isblock(fs
, blksfree
, (daddr_t
)fragstoblks(fs
, bpref
))) {
1136 * Check for a block available on the same cylinder.
1138 cylno
= cbtocylno(fs
, bpref
);
1139 if (cg_blktot(cgp
)[cylno
] == 0)
1141 if (fs
->fs_cpc
== 0) {
1143 * Block layout info is not available, so just
1144 * have to take any block in this cylinder.
1146 bpref
= howmany(fs
->fs_spc
* cylno
, NSPF(fs
));
1150 * Check the summary information to see if a block is
1151 * available in the requested cylinder starting at the
1152 * requested rotational position and proceeding around.
1154 cylbp
= cg_blks(ufsvfsp
, cgp
, cylno
);
1155 pos
= cbtorpos(ufsvfsp
, bpref
);
1156 for (i
= pos
; i
< ufsvfsp
->vfs_nrpos
; i
++)
1159 if (i
== ufsvfsp
->vfs_nrpos
)
1160 for (i
= 0; i
< pos
; i
++)
1165 * Found a rotational position, now find the actual
1166 * block. A "panic" if none is actually there.
1170 * Up to this point, "pos" has referred to the rotational
1171 * position of the desired block. From now on, it holds
1172 * the offset of the current cylinder within a cylinder
1173 * cycle. (A cylinder cycle refers to a set of cylinders
1174 * which are described by a single rotational table; the
1175 * size of the cycle is fs_cpc.)
1177 * bno is set to the block number of the first block within
1178 * the current cylinder cycle.
1181 pos
= cylno
% fs
->fs_cpc
;
1182 bno
= (cylno
- pos
) * fs
->fs_spc
/ NSPB(fs
);
1185 * The blocks within a cylinder are grouped into equivalence
1186 * classes according to their "rotational position." There
1187 * are two tables used to determine these classes.
1189 * The positional offset table (fs_postbl) has an entry for
1190 * each rotational position of each cylinder in a cylinder
1191 * cycle. This entry contains the relative block number
1192 * (counting from the start of the cylinder cycle) of the
1193 * first block in the equivalence class for that position
1194 * and that cylinder. Positions for which no blocks exist
1195 * are indicated by a -1.
1197 * The rotational delta table (fs_rotbl) has an entry for
1198 * each block in a cylinder cycle. This entry contains
1199 * the offset from that block to the next block in the
1200 * same equivalence class. The last block in the class
1201 * is indicated by a zero in the table.
1203 * The following code, then, walks through all of the blocks
1204 * in the cylinder (cylno) which we're allocating within
1205 * which are in the equivalence class for the rotational
1206 * position (i) which we're allocating within.
1209 if (fs_postbl(ufsvfsp
, pos
)[i
] == -1) {
1210 (void) ufs_fault(ufsvfsp
->vfs_root
,
1211 "alloccgblk: cyl groups corrupted, pos = %d, "
1212 "i = %d, fs = %s\n", pos
, i
, fs
->fs_fsmnt
);
1217 * There is one entry in the rotational table for each block
1218 * in the cylinder cycle. These are whole blocks, not frags.
1221 rotbl_size
= (fs
->fs_cpc
* fs
->fs_spc
) >>
1222 (fs
->fs_fragshift
+ fs
->fs_fsbtodb
);
1225 * As we start, "i" is the rotational position within which
1226 * we're searching. After the next line, it will be a block
1227 * number (relative to the start of the cylinder cycle)
1228 * within the equivalence class of that rotational position.
1231 i
= fs_postbl(ufsvfsp
, pos
)[i
];
1234 if (isblock(fs
, blksfree
, (daddr_t
)(bno
+ i
))) {
1235 bno
= blkstofrags(fs
, (bno
+ i
));
1238 delta
= fs_rotbl(fs
)[i
];
1239 if (delta
<= 0 || /* End of chain, or */
1240 delta
+ i
> rotbl_size
) /* end of table? */
1241 break; /* If so, panic. */
1244 (void) ufs_fault(ufsvfsp
->vfs_root
,
1245 "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
1246 "fs:%s bno: %x\n", pos
, i
, fs
->fs_fsmnt
, (int)bno
);
1251 * No blocks in the requested cylinder, so take
1252 * next available one in this cylinder group.
1254 bno
= mapsearch(ufsvfsp
, cgp
, bpref
, (int)fs
->fs_frag
);
1257 cgp
->cg_rotor
= bno
;
1259 blkno
= fragstoblks(fs
, bno
);
1260 frag
= (cgp
->cg_cgx
* fs
->fs_fpg
) + bno
;
1261 if (TRANS_ISCANCEL(ufsvfsp
, ldbtob(fsbtodb(fs
, frag
)), fs
->fs_bsize
))
1263 clrblock(fs
, blksfree
, (long)blkno
);
1265 * the other cg/sb/si fields are TRANS'ed by the caller
1267 cgp
->cg_cs
.cs_nbfree
--;
1268 fs
->fs_cstotal
.cs_nbfree
--;
1269 fs
->fs_cs(fs
, cgp
->cg_cgx
).cs_nbfree
--;
1270 cylno
= cbtocylno(fs
, bno
);
1271 blks
= cg_blks(ufsvfsp
, cgp
, cylno
);
1272 rpos
= cbtorpos(ufsvfsp
, bno
);
1273 blktot
= cg_blktot(cgp
);
1276 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
1282 * Determine whether an inode can be allocated.
1284 * Check to see if an inode is available, and if it is,
1285 * allocate it using the following policy:
1286 * 1) allocate the requested inode.
1287 * 2) allocate the next available inode after the requested
1288 * inode in the specified cylinder group.
1291 ialloccg(struct inode
*ip
, int cg
, daddr_t ipref
, int mode
)
1293 struct ufsvfs
*ufsvfsp
= ip
->i_ufsvfs
;
1294 struct fs
*fs
= ip
->i_fs
;
1297 int start
, len
, loc
, map
, i
;
1300 if (fs
->fs_cs(fs
, cg
).cs_nifree
== 0)
1302 bp
= UFS_BREAD(ufsvfsp
, ip
->i_dev
, (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
1303 (int)fs
->fs_cgsize
);
1305 cgp
= bp
->b_un
.b_cg
;
1306 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
) ||
1307 cgp
->cg_cs
.cs_nifree
== 0) {
1311 iused
= cg_inosused(cgp
);
1312 mutex_enter(&ufsvfsp
->vfs_lock
);
1314 * While we are waiting for the mutex, someone may have taken
1315 * the last available inode. Need to recheck.
1317 if (cgp
->cg_cs
.cs_nifree
== 0) {
1318 mutex_exit(&ufsvfsp
->vfs_lock
);
1323 cgp
->cg_time
= gethrestime_sec();
1325 ipref
%= fs
->fs_ipg
;
1326 if (isclr(iused
, ipref
))
1329 start
= cgp
->cg_irotor
/ NBBY
;
1330 len
= howmany(fs
->fs_ipg
- cgp
->cg_irotor
, NBBY
);
1331 loc
= skpc(0xff, (uint_t
)len
, &iused
[start
]);
1335 loc
= skpc(0xff, (uint_t
)len
, &iused
[0]);
1337 mutex_exit(&ufsvfsp
->vfs_lock
);
1338 (void) ufs_fault(ITOV(ip
),
1339 "ialloccg: map corrupted, cg = %d, irotor = %d, "
1340 "fs = %s\n", cg
, (int)cgp
->cg_irotor
, fs
->fs_fsmnt
);
1344 i
= start
+ len
- loc
;
1347 for (i
= 1; i
< (1 << NBBY
); i
<<= 1, ipref
++) {
1348 if ((map
& i
) == 0) {
1349 cgp
->cg_irotor
= ipref
;
1354 mutex_exit(&ufsvfsp
->vfs_lock
);
1355 (void) ufs_fault(ITOV(ip
), "ialloccg: block not in mapfs = %s",
1359 setbit(iused
, ipref
);
1360 cgp
->cg_cs
.cs_nifree
--;
1361 fs
->fs_cstotal
.cs_nifree
--;
1362 fs
->fs_cs(fs
, cg
).cs_nifree
--;
1363 if (((mode
& IFMT
) == IFDIR
) || ((mode
& IFMT
) == IFATTRDIR
)) {
1364 cgp
->cg_cs
.cs_ndir
++;
1365 fs
->fs_cstotal
.cs_ndir
++;
1366 fs
->fs_cs(fs
, cg
).cs_ndir
++;
1369 ufs_notclean(ufsvfsp
);
1370 TRANS_BUF(ufsvfsp
, 0, fs
->fs_cgsize
, bp
, DT_CG
);
1371 TRANS_SI(ufsvfsp
, fs
, cg
);
1373 return (cg
* fs
->fs_ipg
+ ipref
);
1377 * Find a block of the specified size in the specified cylinder group.
1379 * It is a panic if a request is made to find a block if none are
1383 mapsearch(struct ufsvfs
*ufsvfsp
, struct cg
*cgp
, daddr_t bpref
,
1386 struct fs
*fs
= ufsvfsp
->vfs_fs
;
1388 int start
, len
, loc
, i
, last
, first
, secondtime
;
1389 int blk
, field
, subfield
, pos
;
1393 * ufsvfs->vfs_lock is held when calling this.
1396 * Find the fragment by searching through the
1397 * free block map for an appropriate bit pattern.
1400 start
= dtogd(fs
, bpref
) / NBBY
;
1402 start
= cgp
->cg_frotor
/ NBBY
;
1404 * the following loop performs two scans -- the first scan
1405 * searches the bottom half of the array for a match and the
1406 * second scan searches the top half of the array. The loops
1407 * have been merged just to make things difficult.
1410 last
= howmany(fs
->fs_fpg
, NBBY
);
1412 cfrag
= cgp
->cg_cgx
* fs
->fs_fpg
;
1413 while (first
< last
) {
1416 * search the array for a match
1418 loc
= scanc((unsigned)len
, (uchar_t
*)&cg_blksfree(cgp
)[first
],
1419 (uchar_t
*)fragtbl
[fs
->fs_frag
],
1420 (int)(1 << (allocsiz
- 1 + (fs
->fs_frag
% NBBY
))));
1425 bno
= (last
- loc
) * NBBY
;
1428 * Found the byte in the map, sift
1429 * through the bits to find the selected frag
1431 cgp
->cg_frotor
= bno
;
1433 for (i
= bno
+ NBBY
; bno
< i
; bno
+= fs
->fs_frag
) {
1434 blk
= blkmap(fs
, cg_blksfree(cgp
), bno
);
1436 field
= around
[allocsiz
];
1437 subfield
= inside
[allocsiz
];
1439 pos
<= fs
->fs_frag
- allocsiz
;
1441 if ((blk
& field
) == subfield
) {
1454 * success if block is *not* being converted from
1455 * metadata into userdata (harpy). If so, ignore.
1457 if (!TRANS_ISCANCEL(ufsvfsp
,
1458 ldbtob(fsbtodb(fs
, (cfrag
+bno
))),
1459 allocsiz
* fs
->fs_fsize
))
1463 * keep looking -- this block is being converted
1465 first
= (last
- loc
) + 1;
1471 * no usable matches in bottom half -- now search the top half
1475 * no usable matches in top half -- all done
1485 return ((daddr_t
)-1);
1488 #define UFSNADDR (NDADDR + NIADDR) /* NADDR applies to (obsolete) S5FS */
1489 #define IB(i) (NDADDR + (i)) /* index of i'th indirect block ptr */
1490 #define SINGLE 0 /* single indirect block ptr */
1491 #define DOUBLE 1 /* double indirect block ptr */
1492 #define TRIPLE 2 /* triple indirect block ptr */
1495 * Acquire a write lock, and keep trying till we get it
1498 allocsp_wlockfs(struct vnode
*vp
, struct lockfs
*lf
)
1504 err
= ufs_fiolfss(vp
, lf
);
1507 } while (!LOCKFS_IS_ULOCK(lf
));
1509 lf
->lf_lock
= LOCKFS_WLOCK
;
1511 lf
->lf_comment
= NULL
;
1512 err
= ufs__fiolfs(vp
, lf
, 1, 0);
1514 if (err
== EBUSY
|| err
== EINVAL
)
1521 * Release the write lock
1524 allocsp_unlockfs(struct vnode
*vp
, struct lockfs
*lf
)
1528 lf
->lf_lock
= LOCKFS_ULOCK
;
1530 err
= ufs__fiolfs(vp
, lf
, 1, 0);
1534 struct allocsp_undo
{
1537 struct allocsp_undo
*next
;
1541 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1542 * file system. For direct blocks, the blocks are allocated from the offset
1543 * requested to the block boundary, then any full blocks are allocated,
1544 * and finally any remainder.
1545 * For indirect blocks the blocks are not initialized and are
1546 * only marked as allocated. These addresses are then stored as negative
1547 * block numbers in the inode to imply special handling. UFS has been modified
1548 * where necessary to understand this new notion.
1549 * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
1552 ufs_allocsp(struct vnode
*vp
, struct flock64
*lp
, cred_t
*cr
)
1555 int berr
, err
, resv
, issync
;
1556 off_t istart
, len
; /* istart, special for idb */
1559 struct ufsvfs
*ufsvfsp
;
1560 u_offset_t resid
, i
, uoff
;
1561 daddr32_t db_undo
[NDADDR
]; /* old direct blocks */
1562 struct allocsp_undo
*ib_undo
= NULL
; /* ib undo */
1563 struct allocsp_undo
*undo
= NULL
;
1564 u_offset_t osz
; /* old file size */
1565 int chunkblks
= 0; /* # of blocks in 1 allocation */
1568 daddr_t totblks
= 0;
1569 struct ulockfs
*ulp
;
1571 int nbytes
, offsetn
;
1574 ASSERT(vp
->v_type
== VREG
);
1578 if ((ufsvfsp
= ip
->i_ufsvfs
) == NULL
) {
1583 istart
= blkroundup(fs
, (lp
->l_start
));
1584 len
= blkroundup(fs
, (lp
->l_len
));
1585 chunkblks
= blkroundup(fs
, ufsvfsp
->vfs_iotransz
) / fs
->fs_bsize
;
1586 ulp
= &ufsvfsp
->vfs_ulockfs
;
1588 if (lp
->l_start
< 0 || lp
->l_len
<= 0)
1591 /* Quickly check to make sure we have space before we proceed */
1592 if (lblkno(fs
, len
) > fs
->fs_cstotal
.cs_nbfree
) {
1593 if (TRANS_ISTRANS(ufsvfsp
)) {
1594 ufs_delete_drain_wait(ufsvfsp
, 1);
1595 if (lblkno(fs
, len
) > fs
->fs_cstotal
.cs_nbfree
)
1602 * We will keep i_rwlock locked as WRITER through out the function
1603 * since we don't want anyone else reading or writing to the inode
1604 * while we are in the middle of fallocating the file.
1606 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
1608 /* Back up the direct block list, used for undo later if necessary */
1609 rw_enter(&ip
->i_contents
, RW_READER
);
1610 for (i
= 0; i
< NDADDR
; i
++)
1611 db_undo
[i
] = ip
->i_db
[i
];
1613 rw_exit(&ip
->i_contents
);
1615 /* Write lock the file system */
1616 if (err
= allocsp_wlockfs(vp
, &lf
))
1620 * Allocate any direct blocks now.
1621 * Blocks are allocated from the offset requested to the block
1622 * boundary, then any full blocks are allocated, and finally any
1625 if (lblkno(fs
, lp
->l_start
) < NDADDR
) {
1626 ufs_trans_trunc_resv(ip
, ip
->i_size
+ (NDADDR
* fs
->fs_bsize
),
1628 TRANS_BEGIN_CSYNC(ufsvfsp
, issync
, TOP_ALLOCSP
, resv
);
1630 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1631 rw_enter(&ip
->i_contents
, RW_WRITER
);
1634 while ((done_len
< lp
->l_len
) &&
1635 (lblkno(fs
, lp
->l_start
+ done_len
) < NDADDR
)) {
1636 uoff
= (offset_t
)(lp
->l_start
+ done_len
);
1637 offsetn
= (int)blkoff(fs
, uoff
);
1638 nbytes
= (int)MIN(fs
->fs_bsize
- offsetn
,
1639 lp
->l_len
- done_len
);
1641 berr
= bmap_write(ip
, uoff
, offsetn
+ nbytes
,
1642 BI_FALLOCATE
, &allocblk
, cr
);
1643 /* Yikes error, quit */
1645 TRANS_INODE(ufsvfsp
, ip
);
1646 rw_exit(&ip
->i_contents
);
1647 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1648 TRANS_END_CSYNC(ufsvfsp
, err
, issync
,
1650 err
= allocsp_unlockfs(vp
, &lf
);
1656 if ((uoff
+ nbytes
) > ip
->i_size
)
1657 ip
->i_size
= (uoff
+ nbytes
);
1662 TRANS_INODE(ufsvfsp
, ip
);
1663 rw_exit(&ip
->i_contents
);
1664 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1665 TRANS_END_CSYNC(ufsvfsp
, err
, issync
, TOP_ALLOCSP
, resv
);
1667 /* start offset for indirect allocation */
1668 istart
= (uoff
+ nbytes
);
1671 /* Break the transactions into vfs_iotransz units */
1672 ufs_trans_trunc_resv(ip
, ip
->i_size
+
1673 blkroundup(fs
, ufsvfsp
->vfs_iotransz
), &resv
, &resid
);
1674 TRANS_BEGIN_CSYNC(ufsvfsp
, issync
, TOP_ALLOCSP
, resv
);
1676 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1677 rw_enter(&ip
->i_contents
, RW_WRITER
);
1679 /* Now go about fallocating necessary indirect blocks */
1680 for (i
= istart
; i
< (lp
->l_start
+ lp
->l_len
); i
+= fs
->fs_bsize
) {
1681 berr
= bmap_write(ip
, i
, fs
->fs_bsize
, BI_FALLOCATE
,
1684 TRANS_INODE(ufsvfsp
, ip
);
1685 rw_exit(&ip
->i_contents
);
1686 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1687 TRANS_END_CSYNC(ufsvfsp
, err
, issync
,
1689 err
= allocsp_unlockfs(vp
, &lf
);
1693 /* Update the blk counter only if new block was added */
1695 /* Save undo information */
1696 undo
= kmem_alloc(sizeof (struct allocsp_undo
),
1699 undo
->blk
= allocblk
;
1700 undo
->next
= ib_undo
;
1704 if (i
>= ip
->i_size
)
1705 ip
->i_size
+= fs
->fs_bsize
;
1709 /* Being a good UFS citizen, let others get a share */
1710 if (cnt
== chunkblks
) {
1712 * If there are waiters or the fs is hard locked,
1713 * error locked, or read-only error locked,
1716 if (ULOCKFS_IS_HLOCK(ulp
) || ULOCKFS_IS_ELOCK(ulp
) ||
1717 ULOCKFS_IS_ROELOCK(ulp
)) {
1718 ip
->i_cflags
|= IFALLOCATE
;
1719 TRANS_INODE(ufsvfsp
, ip
);
1720 rw_exit(&ip
->i_contents
);
1721 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1723 TRANS_END_CSYNC(ufsvfsp
, err
, issync
,
1725 rw_exit(&ip
->i_rwlock
);
1726 (void) allocsp_unlockfs(vp
, &lf
);
1730 TRANS_INODE(ufsvfsp
, ip
);
1731 rw_exit(&ip
->i_contents
);
1732 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1734 /* End the current transaction */
1735 TRANS_END_CSYNC(ufsvfsp
, err
, issync
,
1738 if (CV_HAS_WAITERS(&ulp
->ul_cv
)) {
1739 /* Release the write lock */
1740 if (err
= allocsp_unlockfs(vp
, &lf
))
1743 /* Wake up others waiting to do operations */
1744 mutex_enter(&ulp
->ul_lock
);
1745 cv_broadcast(&ulp
->ul_cv
);
1746 mutex_exit(&ulp
->ul_lock
);
1748 /* Grab the write lock again */
1749 if (err
= allocsp_wlockfs(vp
, &lf
))
1751 } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1753 /* Reserve more space in log for this file */
1754 ufs_trans_trunc_resv(ip
,
1755 ip
->i_size
+ blkroundup(fs
, ufsvfsp
->vfs_iotransz
),
1757 TRANS_BEGIN_CSYNC(ufsvfsp
, issync
, TOP_ALLOCSP
, resv
);
1759 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1760 rw_enter(&ip
->i_contents
, RW_WRITER
);
1762 cnt
= 0; /* reset cnt b/c of new transaction */
1767 ip
->i_cflags
|= IFALLOCATE
;
1769 /* If the file has grown then correct the file size */
1770 if (osz
< (lp
->l_start
+ lp
->l_len
))
1771 ip
->i_size
= (lp
->l_start
+ lp
->l_len
);
1773 /* Release locks, end log transaction and unlock fs */
1774 TRANS_INODE(ufsvfsp
, ip
);
1775 rw_exit(&ip
->i_contents
);
1776 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1778 TRANS_END_CSYNC(ufsvfsp
, err
, issync
, TOP_ALLOCSP
, resv
);
1779 err
= allocsp_unlockfs(vp
, &lf
);
1782 * @ exit label, we should no longer be holding the fs write lock, and
1783 * all logging transactions should have been ended. We still hold
1788 * File has grown larger than 2GB. Set flag
1789 * in superblock to indicate this, if it
1790 * is not already set.
1792 if ((ip
->i_size
> MAXOFF32_T
) &&
1793 !(fs
->fs_flags
& FSLARGEFILES
)) {
1794 ASSERT(ufsvfsp
->vfs_lfflags
& UFS_LARGEFILES
);
1795 mutex_enter(&ufsvfsp
->vfs_lock
);
1796 fs
->fs_flags
|= FSLARGEFILES
;
1797 ufs_sbwrite(ufsvfsp
);
1798 mutex_exit(&ufsvfsp
->vfs_lock
);
1802 * Since we couldn't allocate completely, we will undo the allocations.
1805 ufs_trans_trunc_resv(ip
, totblks
* fs
->fs_bsize
, &resv
, &resid
);
1806 TRANS_BEGIN_CSYNC(ufsvfsp
, issync
, TOP_ALLOCSP
, resv
);
1808 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
1809 rw_enter(&ip
->i_contents
, RW_WRITER
);
1812 for (i
= 0; i
< NDADDR
; i
++) {
1814 * Only free the block if they are not same, and
1815 * the old one isn't zero (the fragment was
1818 if (db_undo
[i
] != ip
->i_db
[i
] && db_undo
[i
] == 0) {
1819 free(ip
, ip
->i_db
[i
], fs
->fs_bsize
, 0);
1824 /* Undo the indirect blocks */
1825 while (ib_undo
!= NULL
) {
1827 err
= bmap_set_bn(vp
, undo
->offset
, 0);
1829 cmn_err(CE_PANIC
, "ufs_allocsp(): failed to "
1830 "undo allocation of block %ld",
1832 free(ip
, undo
->blk
, fs
->fs_bsize
, I_IBLK
);
1833 ib_undo
= undo
->next
;
1834 kmem_free(undo
, sizeof (struct allocsp_undo
));
1838 TRANS_INODE(ufsvfsp
, ip
);
1840 rw_exit(&ip
->i_contents
);
1841 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
1843 TRANS_END_CSYNC(ufsvfsp
, err
, issync
, TOP_ALLOCSP
, resv
);
1845 rw_exit(&ip
->i_rwlock
);
1850 * Don't forget to free the undo chain :)
1852 while (ib_undo
!= NULL
) {
1854 ib_undo
= undo
->next
;
1855 kmem_free(undo
, sizeof (struct allocsp_undo
));
1858 rw_exit(&ip
->i_rwlock
);
1865 * Free storage space associated with the specified inode. The portion
1866 * to be freed is specified by lp->l_start and lp->l_len (already
1867 * normalized to a "whence" of 0).
1869 * This is an experimental facility whose continued existence is not
1870 * guaranteed. Currently, we only support the special case
1871 * of l_len == 0, meaning free to end of file.
1873 * Blocks are freed in reverse order. This FILO algorithm will tend to
1874 * maintain a contiguous free list much longer than FIFO.
1875 * See also ufs_itrunc() in ufs_inode.c.
1877 * Bug: unused bytes in the last retained block are not cleared.
1878 * This may result in a "hole" in the file that does not read as zeroes.
1882 ufs_freesp(struct vnode
*vp
, struct flock64
*lp
, int flag
, cred_t
*cr
)
1885 struct inode
*ip
= VTOI(vp
);
1888 ASSERT(vp
->v_type
== VREG
);
1889 ASSERT(lp
->l_start
>= 0); /* checked by convoff */
1894 rw_enter(&ip
->i_contents
, RW_READER
);
1895 if (ip
->i_size
== (u_offset_t
)lp
->l_start
) {
1896 rw_exit(&ip
->i_contents
);
1901 * Check if there is any active mandatory lock on the
1902 * range that will be truncated/expanded.
1904 if (MANDLOCK(vp
, ip
->i_mode
)) {
1905 offset_t save_start
;
1907 save_start
= lp
->l_start
;
1909 if (ip
->i_size
< lp
->l_start
) {
1911 * "Truncate up" case: need to make sure there
1912 * is no lock beyond current end-of-file. To
1913 * do so, we need to set l_start to the size
1914 * of the file temporarily.
1916 lp
->l_start
= ip
->i_size
;
1918 lp
->l_type
= F_WRLCK
;
1920 lp
->l_pid
= ttoproc(curthread
)->p_pid
;
1921 i
= (flag
& (FNDELAY
|FNONBLOCK
)) ? 0 : SLPFLCK
;
1922 rw_exit(&ip
->i_contents
);
1923 if ((i
= reclock(vp
, lp
, i
, 0, lp
->l_start
, NULL
)) != 0 ||
1924 lp
->l_type
!= F_UNLCK
) {
1925 return (i
? i
: EAGAIN
);
1927 rw_enter(&ip
->i_contents
, RW_READER
);
1929 lp
->l_start
= save_start
;
1933 * Make sure a write isn't in progress (allocating blocks)
1934 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1935 * truncate while it was allocating blocks).
1936 * Grab the locks in the right order.
1938 rw_exit(&ip
->i_contents
);
1939 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
1940 error
= TRANS_ITRUNC(ip
, (u_offset_t
)lp
->l_start
, 0, cr
);
1941 rw_exit(&ip
->i_rwlock
);
1946 * Find a cg with as close to nb contiguous bytes as possible
1947 * THIS MAY TAKE MANY DISK READS!
1949 * Implemented in an attempt to allocate contiguous blocks for
1950 * writing the ufs log file to, minimizing future disk head seeking
1953 contigpref(ufsvfs_t
*ufsvfsp
, size_t nb
, size_t minb
)
1955 struct fs
*fs
= ufsvfsp
->vfs_fs
;
1956 daddr_t nblk
= lblkno(fs
, blkroundup(fs
, nb
));
1957 daddr_t minblk
= lblkno(fs
, blkroundup(fs
, minb
));
1958 daddr_t savebno
, curbno
, cgbno
;
1959 int cg
, cgblks
, savecg
, savenblk
, curnblk
, startcg
;
1968 if ((startcg
= findlogstartcg(fs
, nblk
, minblk
)) == -1)
1969 cg
= 0; /* Nothing suitable found */
1973 for (; cg
< fs
->fs_ncg
; ++cg
) {
1975 * find the largest contiguous range in this cg
1977 bp
= UFS_BREAD(ufsvfsp
, ufsvfsp
->vfs_dev
,
1978 (daddr_t
)fsbtodb(fs
, cgtod(fs
, cg
)),
1979 (int)fs
->fs_cgsize
);
1980 cgp
= bp
->b_un
.b_cg
;
1981 if (bp
->b_flags
& B_ERROR
|| !cg_chkmagic(cgp
)) {
1985 blksfree
= cg_blksfree(cgp
); /* free array */
1986 cgblks
= fragstoblks(fs
, fs
->fs_fpg
); /* blks in free array */
1988 while (cgbno
< cgblks
&& savenblk
< nblk
) {
1989 /* find a free block */
1990 for (; cgbno
< cgblks
; ++cgbno
) {
1991 if (isblock(fs
, blksfree
, cgbno
)) {
1992 if (startcg
!= -1) {
2002 /* count the number of free blocks */
2003 for (curnblk
= 0; cgbno
< cgblks
; ++cgbno
) {
2004 if (!isblock(fs
, blksfree
, cgbno
))
2006 if (++curnblk
>= nblk
)
2009 if (curnblk
> savenblk
) {
2016 if (savenblk
>= nblk
)
2022 /* convert block offset in cg to frag offset in cg */
2023 savebno
= blkstofrags(fs
, savebno
);
2025 /* convert frag offset in cg to frag offset in fs */
2026 savebno
+= (savecg
* fs
->fs_fpg
);
2032 * The object of this routine is to find a start point for the UFS log.
2033 * Ideally the space should be allocated from the smallest possible number
2034 * of contiguous cylinder groups. This is found by using a sliding window
2035 * technique. The smallest window of contiguous cylinder groups, which is
2036 * still able to accommodate the target, is found by moving the window
2037 * through the cylinder groups in a single pass. The end of the window is
2038 * advanced until the space is accommodated, then the start is advanced until
2039 * it no longer fits, the end is then advanced again and so on until the
2040 * final cylinder group is reached. The first suitable instance is recorded
2041 * and its starting cg number is returned.
2043 * If we are not able to find a minimum amount of space, represented by
2044 * minblk, or to do so uses more than the available extents, then return -1.
2048 findlogstartcg(struct fs
*fs
, daddr_t requested
, daddr_t minblk
)
2050 int ncgs
; /* number of cylinder groups */
2051 daddr_t target
; /* amount of space sought */
2052 int cwidth
, ctotal
; /* current window width and total */
2053 int bwidth
, btotal
; /* best window width and total so far */
2054 int s
; /* index of the first element in the current window */
2055 int e
; /* index of the first element + the width */
2056 /* (i.e. 1 + index of last element) */
2057 int bs
; /* index of the first element in the best window so far */
2058 int header
, max_extents
;
2063 header
= sizeof (extent_block_t
) - sizeof (extent_t
);
2064 max_extents
= ((fs
->fs_bsize
)-header
) / sizeof (extent_t
);
2065 cwidth
= ctotal
= 0;
2070 /* Advance the end of the window until it accommodates the target. */
2071 while (ctotal
< target
&& e
< ncgs
) {
2072 ctotal
+= fs
->fs_cs(fs
, e
).cs_nbfree
;
2077 * Advance the start of the window until it no longer
2078 * accommodates the target.
2080 while (ctotal
>= target
&& s
< e
) {
2081 /* See if this is the smallest window so far. */
2083 if (cwidth
<= bwidth
) {
2084 if (cwidth
== bwidth
&& ctotal
<= btotal
)
2091 ctotal
-= fs
->fs_cs(fs
, s
).cs_nbfree
;
2097 * If we cannot allocate the minimum required or we use too many
2098 * extents to do so, return -1.
2100 if (btotal
< minblk
|| bwidth
> max_extents
)