1 /* $NetBSD: ulfs_readwrite.c,v 1.19 2015/07/24 06:59:32 dholland Exp $ */
2 /* from NetBSD: ufs_readwrite.c,v 1.105 2013/01/22 09:39:18 dholland Exp */
6 * The Regents of the University of California. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(1, "$NetBSD: ulfs_readwrite.c,v 1.19 2015/07/24 06:59:32 dholland Exp $");
42 #define READ_S "lfs_read"
43 #define WRITE lfs_write
44 #define WRITE_S "lfs_write"
45 #define BUFRD lfs_bufrd
46 #define BUFWR lfs_bufwr
47 #define fs_sb_getbsize(fs) lfs_sb_getbsize(fs)
48 #define fs_bmask lfs_bmask
53 #define READ_S "ffs_read"
54 #define WRITE ffs_write
55 #define WRITE_S "ffs_write"
56 #define BUFRD ffs_bufrd
57 #define BUFWR ffs_bufwr
58 #define fs_sb_getbsize(fs) (fs)->fs_bsize
/*
 * Forward declarations of the file-local post-I/O helpers defined later
 * in this file.  Arguments, per those definitions:
 *   ulfs_post_read_update(vp, ioflag, oerror)
 *   ulfs_post_write_update(vp, uio, ioflag, cred, osize, resid,
 *       extended, oerror)
 */
61 static int ulfs_post_read_update(struct vnode
*, int, int);
62 static int ulfs_post_write_update(struct vnode
*, struct uio
*, int,
63 kauth_cred_t
, off_t
, int, int, int);
66 * Vnode op for reading.
72 struct vop_read_args
/* {
83 int error
, ioflag
, advice
;
89 ioflag
= ap
->a_ioflag
;
92 KASSERT(uio
->uio_rw
== UIO_READ
);
93 KASSERT(vp
->v_type
== VREG
|| vp
->v_type
== VDIR
);
95 /* XXX Eliminate me by refusing directory reads from userland. */
96 if (vp
->v_type
== VDIR
)
97 return BUFRD(vp
, uio
, ioflag
, ap
->a_cred
);
99 /* XXX Eliminate me by using ufs_bufio in lfs. */
100 if (vp
->v_type
== VREG
&& ip
->i_number
== LFS_IFILE_INUM
)
101 return BUFRD(vp
, uio
, ioflag
, ap
->a_cred
);
103 if ((u_int64_t
)uio
->uio_offset
> fs
->um_maxfilesize
)
105 if (uio
->uio_resid
== 0)
108 #ifndef LFS_READWRITE
109 if ((ip
->i_flags
& (SF_SNAPSHOT
| SF_SNAPINVAL
)) == SF_SNAPSHOT
)
110 return ffs_snapshot_read(vp
, uio
, ioflag
);
111 #endif /* !LFS_READWRITE */
113 fstrans_start(vp
->v_mount
, FSTRANS_SHARED
);
115 if (uio
->uio_offset
>= ip
->i_size
)
118 KASSERT(vp
->v_type
== VREG
);
119 advice
= IO_ADV_DECODE(ap
->a_ioflag
);
120 while (uio
->uio_resid
> 0) {
121 if (ioflag
& IO_DIRECT
) {
122 genfs_directio(vp
, uio
, ioflag
);
124 bytelen
= MIN(ip
->i_size
- uio
->uio_offset
, uio
->uio_resid
);
127 error
= ubc_uiomove(&vp
->v_uobj
, uio
, bytelen
, advice
,
128 UBC_READ
| UBC_PARTIALOK
| UBC_UNMAP_FLAG(vp
));
134 error
= ulfs_post_read_update(vp
, ap
->a_ioflag
, error
);
135 fstrans_done(vp
->v_mount
);
140 * UFS op for reading via the buffer cache
143 BUFRD(struct vnode
*vp
, struct uio
*uio
, int ioflag
, kauth_cred_t cred
)
148 daddr_t lbn
, nextlbn
;
150 long size
, xfersize
, blkoffset
;
153 KASSERT(VOP_ISLOCKED(vp
));
154 KASSERT(vp
->v_type
== VDIR
|| vp
->v_type
== VLNK
||
156 KASSERT(uio
->uio_rw
== UIO_READ
);
162 KASSERT(vp
->v_type
!= VLNK
|| ip
->i_size
< fs
->um_maxsymlinklen
);
163 KASSERT(vp
->v_type
!= VLNK
|| fs
->um_maxsymlinklen
!= 0 ||
164 DIP(ip
, blocks
) == 0);
165 KASSERT(vp
->v_type
!= VREG
|| vp
== fs
->lfs_ivnode
);
166 KASSERT(vp
->v_type
!= VREG
|| ip
->i_number
== LFS_IFILE_INUM
);
168 if (uio
->uio_offset
> fs
->um_maxfilesize
)
170 if (uio
->uio_resid
== 0)
173 #ifndef LFS_READWRITE
174 KASSERT(!ISSET(ip
->i_flags
, (SF_SNAPSHOT
| SF_SNAPINVAL
)));
177 fstrans_start(vp
->v_mount
, FSTRANS_SHARED
);
179 if (uio
->uio_offset
>= ip
->i_size
)
182 for (error
= 0, bp
= NULL
; uio
->uio_resid
> 0; bp
= NULL
) {
183 bytesinfile
= ip
->i_size
- uio
->uio_offset
;
184 if (bytesinfile
<= 0)
186 lbn
= lfs_lblkno(fs
, uio
->uio_offset
);
188 size
= lfs_blksize(fs
, ip
, lbn
);
189 blkoffset
= lfs_blkoff(fs
, uio
->uio_offset
);
190 xfersize
= MIN(MIN(fs_sb_getbsize(fs
) - blkoffset
, uio
->uio_resid
),
193 if (lfs_lblktosize(fs
, nextlbn
) >= ip
->i_size
)
194 error
= bread(vp
, lbn
, size
, 0, &bp
);
196 int nextsize
= lfs_blksize(fs
, ip
, nextlbn
);
197 error
= breadn(vp
, lbn
,
198 size
, &nextlbn
, &nextsize
, 1, 0, &bp
);
204 * We should only get non-zero b_resid when an I/O error
205 * has occurred, which should cause us to break above.
206 * However, if the short read did not cause an error,
207 * then we want to ensure that we do not uiomove bad
208 * or uninitialized data.
211 if (size
< xfersize
) {
216 error
= uiomove((char *)bp
->b_data
+ blkoffset
, xfersize
, uio
);
225 error
= ulfs_post_read_update(vp
, ioflag
, error
);
226 fstrans_done(vp
->v_mount
);
231 ulfs_post_read_update(struct vnode
*vp
, int ioflag
, int oerror
)
233 struct inode
*ip
= VTOI(vp
);
236 if (!(vp
->v_mount
->mnt_flag
& MNT_NOATIME
)) {
237 ip
->i_flag
|= IN_ACCESS
;
238 if ((ioflag
& IO_SYNC
) == IO_SYNC
) {
239 error
= lfs_update(vp
, NULL
, NULL
, UPDATE_WAIT
);
243 /* Read error overrides any inode update error. */
250 * Vnode op for writing.
255 struct vop_write_args
/* {
266 off_t osize
, origoff
, oldoff
, preallocoff
, endallocoff
, nsize
;
267 int blkoffset
, error
, flags
, ioflag
, resid
;
274 ioflag
= ap
->a_ioflag
;
279 KASSERT(vp
->v_size
== ip
->i_size
);
280 KASSERT(uio
->uio_rw
== UIO_WRITE
);
281 KASSERT(vp
->v_type
== VREG
);
283 if (ioflag
& IO_APPEND
)
284 uio
->uio_offset
= ip
->i_size
;
285 if ((ip
->i_flags
& APPEND
) && uio
->uio_offset
!= ip
->i_size
)
289 if (uio
->uio_offset
< 0 ||
290 (u_int64_t
)uio
->uio_offset
+ uio
->uio_resid
> fs
->um_maxfilesize
)
293 /* Disallow writes to the Ifile, even if noschg flag is removed */
294 /* XXX can this go away when the Ifile is no longer in the namespace? */
295 if (vp
== fs
->lfs_ivnode
)
298 if (uio
->uio_resid
== 0)
301 fstrans_start(vp
->v_mount
, FSTRANS_SHARED
);
303 flags
= ioflag
& IO_SYNC
? B_SYNC
: 0;
304 async
= vp
->v_mount
->mnt_flag
& MNT_ASYNC
;
305 origoff
= uio
->uio_offset
;
306 resid
= uio
->uio_resid
;
310 KASSERT(vp
->v_type
== VREG
);
314 lfs_availwait(fs
, lfs_btofsb(fs
, uio
->uio_resid
));
315 lfs_check(vp
, LFS_UNUSED_LBN
, 0);
316 #endif /* !LFS_READWRITE */
318 preallocoff
= round_page(lfs_blkroundup(fs
, MAX(osize
, uio
->uio_offset
)));
319 aflag
= ioflag
& IO_SYNC
? B_SYNC
: 0;
320 nsize
= MAX(osize
, uio
->uio_offset
+ uio
->uio_resid
);
321 endallocoff
= nsize
- lfs_blkoff(fs
, nsize
);
324 * if we're increasing the file size, deal with expanding
325 * the fragment if there is one.
328 if (nsize
> osize
&& lfs_lblkno(fs
, osize
) < ULFS_NDADDR
&&
329 lfs_lblkno(fs
, osize
) != lfs_lblkno(fs
, nsize
) &&
330 lfs_blkroundup(fs
, osize
) != osize
) {
333 eob
= lfs_blkroundup(fs
, osize
);
334 uvm_vnp_setwritesize(vp
, eob
);
335 error
= ulfs_balloc_range(vp
, osize
, eob
- osize
, cred
, aflag
);
338 if (flags
& B_SYNC
) {
339 mutex_enter(vp
->v_interlock
);
340 VOP_PUTPAGES(vp
, trunc_page(osize
& lfs_sb_getbmask(fs
)),
342 PGO_CLEANIT
| PGO_SYNCIO
);
346 while (uio
->uio_resid
> 0) {
347 int ubc_flags
= UBC_WRITE
;
348 bool overwrite
; /* if we're overwriting a whole block */
351 if (ioflag
& IO_DIRECT
) {
352 genfs_directio(vp
, uio
, ioflag
);
355 oldoff
= uio
->uio_offset
;
356 blkoffset
= lfs_blkoff(fs
, uio
->uio_offset
);
357 bytelen
= MIN(fs_sb_getbsize(fs
) - blkoffset
, uio
->uio_resid
);
363 * if we're filling in a hole, allocate the blocks now and
364 * initialize the pages first. if we're extending the file,
365 * we can safely allocate blocks without initializing pages
366 * since the new blocks will be inaccessible until the write is complete.
369 overwrite
= uio
->uio_offset
>= preallocoff
&&
370 uio
->uio_offset
< endallocoff
;
371 if (!overwrite
&& (vp
->v_vflag
& VV_MAPPED
) == 0 &&
372 lfs_blkoff(fs
, uio
->uio_offset
) == 0 &&
373 (uio
->uio_offset
& PAGE_MASK
) == 0) {
376 len
= trunc_page(bytelen
);
377 len
-= lfs_blkoff(fs
, len
);
384 newoff
= oldoff
+ bytelen
;
385 if (vp
->v_size
< newoff
) {
386 uvm_vnp_setwritesize(vp
, newoff
);
390 error
= ulfs_balloc_range(vp
, uio
->uio_offset
, bytelen
,
395 genfs_node_wrlock(vp
);
396 error
= GOP_ALLOC(vp
, uio
->uio_offset
, bytelen
,
398 genfs_node_unlock(vp
);
401 ubc_flags
|= UBC_FAULTBUSY
;
408 error
= ubc_uiomove(&vp
->v_uobj
, uio
, bytelen
,
409 IO_ADV_DECODE(ioflag
), ubc_flags
| UBC_UNMAP_FLAG(vp
));
412 * update UVM's notion of the size now that we've
413 * copied the data into the vnode's pages.
415 * we should update the size even when uiomove failed.
418 if (vp
->v_size
< newoff
) {
419 uvm_vnp_setsize(vp
, newoff
);
427 * flush what we just wrote if necessary.
428 * XXXUBC simplistic async flushing.
431 #ifndef LFS_READWRITE
432 if (!async
&& oldoff
>> 16 != uio
->uio_offset
>> 16) {
433 mutex_enter(vp
->v_interlock
);
434 error
= VOP_PUTPAGES(vp
, (oldoff
>> 16) << 16,
435 (uio
->uio_offset
>> 16) << 16,
436 PGO_CLEANIT
| PGO_LAZY
);
444 if (error
== 0 && ioflag
& IO_SYNC
) {
445 mutex_enter(vp
->v_interlock
);
446 error
= VOP_PUTPAGES(vp
, trunc_page(origoff
& lfs_sb_getbmask(fs
)),
447 round_page(lfs_blkroundup(fs
, uio
->uio_offset
)),
448 PGO_CLEANIT
| PGO_SYNCIO
);
452 error
= ulfs_post_write_update(vp
, uio
, ioflag
, cred
, osize
, resid
,
454 fstrans_done(vp
->v_mount
);
460 * UFS op for writing via the buffer cache
463 BUFWR(struct vnode
*vp
, struct uio
*uio
, int ioflag
, kauth_cred_t cred
)
470 int resid
, xfersize
, size
, blkoffset
;
475 bool need_unreserve
= false;
478 KASSERT(ISSET(ioflag
, IO_NODELOCKED
));
479 KASSERT(VOP_ISLOCKED(vp
) == LK_EXCLUSIVE
);
480 KASSERT(vp
->v_type
== VDIR
|| vp
->v_type
== VLNK
);
481 KASSERT(vp
->v_type
!= VDIR
|| ISSET(ioflag
, IO_SYNC
));
482 KASSERT(uio
->uio_rw
== UIO_WRITE
);
487 KASSERT(vp
->v_size
== ip
->i_size
);
489 if (uio
->uio_offset
< 0 ||
490 uio
->uio_resid
> fs
->um_maxfilesize
||
491 uio
->uio_offset
> (fs
->um_maxfilesize
- uio
->uio_resid
))
494 KASSERT(vp
!= fs
->lfs_ivnode
);
496 if (uio
->uio_resid
== 0)
499 fstrans_start(vp
->v_mount
, FSTRANS_SHARED
);
501 flags
= ioflag
& IO_SYNC
? B_SYNC
: 0;
502 resid
= uio
->uio_resid
;
506 KASSERT(vp
->v_type
!= VREG
);
509 lfs_availwait(fs
, lfs_btofsb(fs
, uio
->uio_resid
));
510 lfs_check(vp
, LFS_UNUSED_LBN
, 0);
511 #endif /* !LFS_READWRITE */
513 /* XXX Should never have pages cached here. */
514 KASSERT(vp
->v_uobj
.uo_npages
== 0);
515 while (uio
->uio_resid
> 0) {
516 lbn
= lfs_lblkno(fs
, uio
->uio_offset
);
517 blkoffset
= lfs_blkoff(fs
, uio
->uio_offset
);
518 xfersize
= MIN(fs_sb_getbsize(fs
) - blkoffset
, uio
->uio_resid
);
519 if (fs_sb_getbsize(fs
) > xfersize
)
525 error
= lfs_reserve(fs
, vp
, NULL
,
526 lfs_btofsb(fs
, (ULFS_NIADDR
+ 1) << lfs_sb_getbshift(fs
)));
529 need_unreserve
= true;
531 error
= lfs_balloc(vp
, uio
->uio_offset
, xfersize
, cred
, flags
,
536 if (uio
->uio_offset
+ xfersize
> ip
->i_size
) {
537 ip
->i_size
= uio
->uio_offset
+ xfersize
;
538 DIP_ASSIGN(ip
, size
, ip
->i_size
);
539 uvm_vnp_setsize(vp
, ip
->i_size
);
542 size
= lfs_blksize(fs
, ip
, lbn
) - bp
->b_resid
;
546 error
= uiomove((char *)bp
->b_data
+ blkoffset
, xfersize
, uio
);
549 * if we didn't clear the block and the uiomove failed,
550 * the buf will now contain part of some other file,
551 * so we need to invalidate it.
553 if (error
&& (flags
& B_CLRBUF
) == 0) {
554 brelse(bp
, BC_INVAL
);
558 (void)VOP_BWRITE(bp
->b_vp
, bp
);
559 lfs_reserve(fs
, vp
, NULL
,
560 -lfs_btofsb(fs
, (ULFS_NIADDR
+ 1) << lfs_sb_getbshift(fs
)));
561 need_unreserve
= false;
563 if (ioflag
& IO_SYNC
)
565 else if (xfersize
+ blkoffset
== fs
->fs_bsize
)
570 if (error
|| xfersize
== 0)
574 if (need_unreserve
) {
575 lfs_reserve(fs
, vp
, NULL
,
576 -lfs_btofsb(fs
, (ULFS_NIADDR
+ 1) << lfs_sb_getbshift(fs
)));
580 error
= ulfs_post_write_update(vp
, uio
, ioflag
, cred
, osize
, resid
,
582 fstrans_done(vp
->v_mount
);
588 ulfs_post_write_update(struct vnode
*vp
, struct uio
*uio
, int ioflag
,
589 kauth_cred_t cred
, off_t osize
, int resid
, int extended
, int oerror
)
591 struct inode
*ip
= VTOI(vp
);
594 /* Trigger ctime and mtime updates, and atime if MNT_RELATIME. */
595 ip
->i_flag
|= IN_CHANGE
| IN_UPDATE
;
596 if (vp
->v_mount
->mnt_flag
& MNT_RELATIME
)
597 ip
->i_flag
|= IN_ACCESS
;
600 * If we successfully wrote any data and we are not the superuser,
601 * we clear the setuid and setgid bits as a precaution against tampering.
604 if (resid
> uio
->uio_resid
&& cred
) {
605 if (ip
->i_mode
& ISUID
) {
606 if (kauth_authorize_vnode(cred
,
607 KAUTH_VNODE_RETAIN_SUID
, vp
, NULL
, EPERM
) != 0) {
608 ip
->i_mode
&= ~ISUID
;
609 DIP_ASSIGN(ip
, mode
, ip
->i_mode
);
613 if (ip
->i_mode
& ISGID
) {
614 if (kauth_authorize_vnode(cred
,
615 KAUTH_VNODE_RETAIN_SGID
, vp
, NULL
, EPERM
) != 0) {
616 ip
->i_mode
&= ~ISGID
;
617 DIP_ASSIGN(ip
, mode
, ip
->i_mode
);
622 /* If we successfully wrote anything, notify kevent listeners. */
623 if (resid
> uio
->uio_resid
)
624 VN_KNOTE(vp
, NOTE_WRITE
| (extended
? NOTE_EXTEND
: 0));
627 * Update the size on disk: truncate back to original size on
628 * error, or reflect the new size on success.
631 (void) lfs_truncate(vp
, osize
, ioflag
& IO_SYNC
, cred
);
632 uio
->uio_offset
-= resid
- uio
->uio_resid
;
633 uio
->uio_resid
= resid
;
634 } else if (resid
> uio
->uio_resid
&& (ioflag
& IO_SYNC
) == IO_SYNC
) {
635 error
= lfs_update(vp
, NULL
, NULL
, UPDATE_WAIT
);
640 /* Make sure the vnode uvm size matches the inode file size. */
641 KASSERT(vp
->v_size
== ip
->i_size
);
643 /* Write error overrides any inode update error. */