/* $NetBSD: dev-io.c,v 1.5 2009/12/02 00:58:03 haad Exp $ */

/*
 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
 *
 * This file is part of LVM2.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU Lesser General Public License v.2.1.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
19 #include "lvm-types.h"
30 #include <sys/ioctl.h>
33 # define u64 uint64_t /* Missing without __KERNEL__ */
34 # undef WNOHANG /* Avoid redefinition */
35 # undef WUNTRACED /* Avoid redefinition */
36 # include <linux/fs.h> /* For block ioctl definitions */
37 # define BLKSIZE_SHIFT SECTOR_SHIFT
38 # ifndef BLKGETSIZE64 /* fs.h out-of-date */
39 # define BLKGETSIZE64 _IOR(0x12, 114, size_t)
40 # endif /* BLKGETSIZE64 */
42 # include <sys/disk.h>
43 # include <sys/disklabel.h>
44 # include <sys/param.h>
46 # include <sys/disk.h>
47 # define BLKBSZGET DKIOCGETBLOCKSIZE
48 # define BLKSSZGET DKIOCGETBLOCKSIZE
49 # define BLKGETSIZE64 DKIOCGETBLOCKCOUNT
50 # define BLKFLSBUF DKIOCSYNCHRONIZECACHE
51 # define BLKSIZE_SHIFT 0
54 #ifdef O_DIRECT_SUPPORT
56 # error O_DIRECT support configured but O_DIRECT definition not found in headers
/* All devices currently held open, linked through dev->open_list. */
static DM_LIST_INIT(_open_devices);
/*-----------------------------------------------------------------
 * The standard io loop that keeps submitting an io until it's
 * completed (or fails).
 * NOTE(review): several lines of this function (opening brace, the
 * fd < 0 guard, the declarations/updates of 'n' and 'total', and the
 * early returns) were lost in extraction — restore before building.
 *---------------------------------------------------------------*/
static int _io(struct device_area *where, void *buffer, int should_write)
	/* File descriptor of the (already opened) device. */
	int fd = dev_fd(where->dev);

	log_error("Attempt to read an unopened device (%s).",
		  dev_name(where->dev));

	/*
	 * Skip all writes in test mode.
	 */
	if (should_write && test_mode())

	/* A single read()/write() cannot transfer more than SSIZE_MAX. */
	if (where->size > SSIZE_MAX) {
		log_error("Read size too large: %" PRIu64, where->size);

	/* Position at the absolute byte offset of the region. */
	if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) {
		log_error("%s: lseek %" PRIu64 " failed: %s",
			  dev_name(where->dev), (uint64_t) where->start,

	/* Keep issuing I/O until the whole region has been transferred,
	 * retrying transparently on EINTR/EAGAIN (short transfers loop). */
	while (total < (size_t) where->size) {
		write(fd, buffer, (size_t) where->size - total) :
		read(fd, buffer, (size_t) where->size - total);
	while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN)));

	log_error("%s: %s failed after %" PRIu64 " of %" PRIu64
		  " at %" PRIu64 ": %s", dev_name(where->dev),
		  should_write ? "write" : "read",
		  (uint64_t) where->size,
		  (uint64_t) where->start, strerror(errno));

	/* Success only if every byte of the region was transferred. */
	return (total == (size_t) where->size);
/*-----------------------------------------------------------------
 * LVM2 uses O_DIRECT when performing metadata io, which requires
 * block size aligned accesses.  If any io is not aligned we have
 * to perform the io via a bounce buffer, obviously this is quite
 * inefficient.
 *---------------------------------------------------------------*/

/*
 * Get the sector size from an _open_ device.
 * Caches the result in dev->block_size (-1 means "not queried yet").
 * NOTE(review): the disklabel and BLKBSZGET paths appear to have been
 * separated by platform #ifdefs that were lost in extraction.
 */
static int _get_block_size(struct device *dev, unsigned int *size)
	const char *name = dev_name(dev);
	struct disklabel lab;

	/* Only ask the kernel once per device. */
	if ((dev->block_size == -1)) {
		/* NetBSD path: take the sector size from the disklabel,
		 * falling back to DEV_BSIZE when there is no label. */
		if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) {
			dev->block_size = DEV_BSIZE;
		dev->block_size = lab.d_secsize;
		/* Linux path: BLKBSZGET reports the block size directly. */
		if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) {
			log_sys_error("ioctl BLKBSZGET", name);
	log_debug("%s: block size is %u bytes", name, dev->block_size);

	*size = (unsigned int) dev->block_size;
159 * Widens a region to be an aligned region.
161 static void _widen_region(unsigned int block_size
, struct device_area
*region
,
162 struct device_area
*result
)
164 uint64_t mask
= block_size
- 1, delta
;
165 memcpy(result
, region
, sizeof(*result
));
167 /* adjust the start */
168 delta
= result
->start
& mask
;
170 result
->start
-= delta
;
171 result
->size
+= delta
;
175 delta
= (result
->start
+ result
->size
) & mask
;
177 result
->size
+= block_size
- delta
;
/*
 * Perform the I/O described by 'where', routing it through a
 * block-size-aligned bounce buffer when either the region or the
 * caller's buffer is not aligned (as O_DIRECT requires).
 * NOTE(review): opening brace, the declarations of 'bounce' and 'mask',
 * the should_write branch structure and several returns were lost in
 * extraction.
 */
static int _aligned_io(struct device_area *where, void *buffer,
	unsigned int block_size = 0;
	struct device_area widened;

	/* Regular files have no device block-size constraint. */
	if (!(where->dev->flags & DEV_REGULAR) &&
	    !_get_block_size(where->dev, &block_size))

	/* Unknown block size: fall back to the VM page size. */
	block_size = lvm_getpagesize();

	_widen_region(block_size, where, &widened);

	/* Do we need to use a bounce buffer? */
	mask = block_size - 1;
	if (!memcmp(where, &widened, sizeof(widened)) &&
	    !((uintptr_t) buffer & mask))
		return _io(where, buffer, should_write);

	/* Allocate a bounce buffer with an extra block */
	/* NOTE(review): alloca() has no failure return — this NULL check is
	 * ineffective; stack exhaustion here is undefined behavior. */
	if (!(bounce = alloca((size_t) widened.size + block_size))) {
		log_error("Bounce buffer alloca failed");

	/*
	 * Realign start of bounce buffer (using the extra sector)
	 */
	if (((uintptr_t) bounce) & mask)
		bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask);

	/* channel the io through the bounce buffer */
	/* Pre-read the whole widened region so bytes outside 'where' are
	 * preserved when it is written back. */
	if (!_io(&widened, bounce, 0)) {
		/* FIXME pre-extend the file */
		memset(bounce, '\n', widened.size);

	/* Writes: merge the caller's data into the bounce buffer ... */
	memcpy(bounce + (where->start - widened.start), buffer,
	       (size_t) where->size);

	/* ... then we write */
	return _io(&widened, bounce, 1);

	/* Reads: copy the requested sub-region back to the caller. */
	memcpy(buffer, bounce + (where->start - widened.start),
	       (size_t) where->size);
/*
 * Size of a regular file, reported in 512-byte sectors via stat(2).
 */
static int _dev_get_size_file(const struct device *dev, uint64_t *size)
	const char *name = dev_name(dev);

	if (stat(name, &info)) {
		log_sys_error("stat", name);

	*size = info.st_size;
	*size >>= SECTOR_SHIFT;	/* Convert to sectors */

	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
/*
 * Size of a block device, in sectors.  Tries the NetBSD disklabel
 * (DIOCGDINFO), then wedge info (DIOCGWEDGEINFO), with BLKGETSIZE64
 * as the non-NetBSD path.
 * NOTE(review): platform #ifdefs, close() calls and returns were lost
 * in extraction.
 */
static int _dev_get_size_dev(const struct device *dev, uint64_t *size)
	const char *name = dev_name(dev);
	struct disklabel lab;
	struct dkwedge_info dkw;

	/* Transient open — the device need not already be open. */
	if ((fd = open(name, O_RDONLY)) < 0) {
		log_sys_error("open", name);

	/* Byte size of the device via seeking to its end.
	 * NOTE(review): *size is uint64_t, so this "< 0" comparison can
	 * never be true — lseek failure (-1) is not detected here. */
	if ((*size = lseek (fd, 0, SEEK_END)) < 0) {
		log_sys_error("lseek SEEK_END", name);

	if (ioctl(fd, DIOCGDINFO, &lab) < 0) {
		/* No disklabel: try wedge (dk) information instead. */
		if (ioctl(fd, DIOCGWEDGEINFO, &dkw) < 0) {
			/* NOTE(review): format string has no %s — 'name' is
			 * unused by this log_debug call. */
			log_debug("ioctl DIOCGWEDGEINFO", name);
		/* dkw_size is already a sector count. */
		*size = dkw.dkw_size;
	/* Convert the byte count to sectors using the label sector size. */
	*size /= lab.d_secsize;
	/* Non-NetBSD path: BLKGETSIZE64 reports the size in bytes. */
	if (ioctl(fd, BLKGETSIZE64, size) < 0) {
		log_sys_error("ioctl BLKGETSIZE64", name);
		log_sys_error("close", name);
	*size >>= BLKSIZE_SHIFT;	/* Convert to sectors */

		log_sys_error("close", name);

	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
/*
 * Query the device's read-ahead setting (in sectors), caching the
 * result in dev->read_ahead (-1 means "not queried yet").
 */
static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead)
	long read_ahead_long;

	/* Serve from the per-device cache when possible. */
	if (dev->read_ahead != -1) {
		*read_ahead = (uint32_t) dev->read_ahead;

	if (ioctl(dev->fd, BLKRAGET, &read_ahead_long) < 0) {
		log_sys_error("ioctl BLKRAGET", dev_name(dev));

	*read_ahead = (uint32_t) read_ahead_long;
	dev->read_ahead = read_ahead_long;

	log_very_verbose("%s: read_ahead is %u sectors",
			 dev_name(dev), *read_ahead);
/*-----------------------------------------------------------------
 * Public functions
 *---------------------------------------------------------------*/

/* Device size in 512-byte sectors; dispatches on regular file vs device. */
int dev_get_size(const struct device *dev, uint64_t *size)
	if ((dev->flags & DEV_REGULAR))
		return _dev_get_size_file(dev, size);

	return _dev_get_size_dev(dev, size);
/*
 * Read-ahead in sectors; regular files take the DEV_REGULAR branch
 * (its body was lost in extraction — presumably reports 0; confirm
 * against upstream).
 */
int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead)
	if (dev->flags & DEV_REGULAR) {

	return _dev_read_ahead_dev(dev, read_ahead);
/*
 * Logical sector size in bytes, obtained via a transient open and
 * BLKSSZGET (aliased to DKIOCGETBLOCKSIZE on some platforms above).
 */
int dev_get_sectsize(struct device *dev, uint32_t *size)
	const char *name = dev_name(dev);

	if ((fd = open(name, O_RDONLY)) < 0) {
		log_sys_error("open", name);

	if (ioctl(fd, BLKSSZGET, &s) < 0) {
		log_sys_error("ioctl BLKSSZGET", name);
		/* close() failure on the error path. */
		log_sys_error("close", name);
	/* close() failure on the success path. */
	log_sys_error("close", name);

	*size = (uint32_t) s;

	log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size);
/*
 * Flush buffered data for the device: BLKFLSBUF for block devices,
 * falling back to fsync(2).
 */
void dev_flush(struct device *dev)
	if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0)

	if (fsync(dev->fd) >= 0)
/*
 * Open 'dev' with the given open(2) 'flags'.
 * 'direct' requests O_DIRECT where supported; 'quiet' downgrades
 * open-failure logging from error to debug level.
 * An already-open device is reused when its mode is sufficient,
 * otherwise it is closed and reopened below.
 * NOTE(review): many lines (braces, returns, open_count handling,
 * the O_DIRECT/O_NOATIME flag assembly) were lost in extraction.
 */
int dev_open_flags(struct device *dev, int flags, int direct, int quiet)
	int need_excl = 0, need_rw = 0;

	if ((flags & O_ACCMODE) == O_RDWR)

	if ((flags & O_EXCL))

	/* Reuse the existing fd if already open in a sufficient mode. */
	if (((dev->flags & DEV_OPENED_RW) || !need_rw) &&
	    ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) {

	if (dev->open_count && !need_excl) {
		/* FIXME Ensure we never get here */
		log_debug("WARNING: %s already opened read-only",

	/* Current mode insufficient: close so we can reopen below. */
	dev_close_immediate(dev);

	log_error("WARNING: dev_open(%s) called while suspended",

	if (dev->flags & DEV_REGULAR)
		name = dev_name(dev);
	else if (!(name = dev_name_confirmed(dev, quiet)))

	/* Verify the path still refers to the same device node. */
	if (!(dev->flags & DEV_REGULAR)) {
		if (stat(name, &buf) < 0) {
			log_sys_error("%s: stat failed", name);
		if (buf.st_rdev != dev->dev) {
			log_error("%s: device changed", name);

#ifdef O_DIRECT_SUPPORT
	/* Assume O_DIRECT works until the first open proves otherwise. */
	if (!(dev->flags & DEV_O_DIRECT_TESTED))
		dev->flags |= DEV_O_DIRECT;

	if ((dev->flags & DEV_O_DIRECT))

	/* Don't update atime on device inodes */
	if (!(dev->flags & DEV_REGULAR))

	if ((dev->fd = open(name, flags, 0777)) < 0) {
#ifdef O_DIRECT_SUPPORT
		/* Retry once without O_DIRECT and remember the outcome. */
		if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) {
			if ((dev->fd = open(name, flags, 0777)) >= 0) {
				dev->flags &= ~DEV_O_DIRECT;
				log_debug("%s: Not using O_DIRECT", name);

		log_sys_debug("open", name);

		log_sys_error("open", name);

#ifdef O_DIRECT_SUPPORT
	dev->flags |= DEV_O_DIRECT_TESTED;

	dev->flags &= ~DEV_ACCESSED_W;

	/* Record the access mode actually obtained. */
	dev->flags |= DEV_OPENED_RW;

	dev->flags &= ~DEV_OPENED_RW;

	dev->flags |= DEV_OPENED_EXCL;

	dev->flags &= ~DEV_OPENED_EXCL;

	/* Guard against the node changing between stat() and open(). */
	if (!(dev->flags & DEV_REGULAR) &&
	    ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) {
		log_error("%s: fstat failed: Has device name changed?", name);
		dev_close_immediate(dev);

#ifndef O_DIRECT_SUPPORT
	if (!(dev->flags & DEV_REGULAR))

	/* Remember the current end-of-data offset for dev_append(). */
	if ((flags & O_CREAT) && !(flags & O_TRUNC))
		dev->end = lseek(dev->fd, (off_t) 0, SEEK_END);

	/* Track this device in the global open-devices list. */
	dm_list_add(&_open_devices, &dev->open_list);

	log_debug("Opened %s %s%s%s", dev_name(dev),
		  dev->flags & DEV_OPENED_RW ? "RW" : "RO",
		  dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "",
		  dev->flags & DEV_O_DIRECT ? " O_DIRECT" : "");
/*
 * Open the device without logging open failures as errors; mode is
 * read/write only when a VG write lock is held.
 */
int dev_open_quiet(struct device *dev)
	flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, flags, 1, 1);
/*
 * Standard open: read/write only when a VG write lock is held,
 * O_DIRECT requested, failures logged as errors.
 */
int dev_open(struct device *dev)
	flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, flags, 1, 0);
/*
 * Test whether the device can be opened exclusively, closing it again
 * immediately afterwards and returning the open result.
 * NOTE(review): the line OR-ing O_EXCL into 'flags' appears to have
 * been lost in extraction — confirm against upstream.
 */
int dev_test_excl(struct device *dev)
	flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	r = dev_open_flags(dev, flags, 1, 1);

	dev_close_immediate(dev);
/*
 * Really close the fd and remove the device from _open_devices,
 * freeing alias storage for devices allocated outside the cache pool.
 */
static void _close(struct device *dev)
	log_sys_error("close", dev_name(dev));

	/* Force the block size to be re-queried on next open. */
	dev->block_size = -1;
	dm_list_del(&dev->open_list);

	log_debug("Closed %s", dev_name(dev));

	if (dev->flags & DEV_ALLOCED) {
		/* NOTE(review): this dm_free() call is truncated mid-expression
		 * in the extract (the trailing member access was lost). */
		dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)->
		dm_free(dev->aliases.n);
/*
 * Drop one reference to the open device and close it when no longer
 * needed; 'immediate' forces the close even while still referenced.
 * NOTE(review): the open_count decrement, flush call and final
 * condition head were lost in extraction.
 */
static int _dev_close(struct device *dev, int immediate)
	struct lvmcache_info *info;

	log_error("Attempt to close device '%s' "
		  "which is not open.", dev_name(dev));

#ifndef O_DIRECT_SUPPORT
	/* Without O_DIRECT, written data must be flushed explicitly. */
	if (dev->flags & DEV_ACCESSED_W)

	if (dev->open_count > 0)

	if (immediate && dev->open_count)
		log_debug("%s: Immediate close attempt while still referenced",

	/* Close unless device is known to belong to a locked VG */
	(dev->open_count < 1 &&
	 (!(info = info_from_pvid(dev->pvid, 0)) ||
	  !vgname_is_locked(info->vginfo->vgname))))
/*
 * Drop one reference; the device is really closed by _dev_close()
 * only once it is unreferenced and not part of a locked VG.
 */
int dev_close(struct device *dev)
{
	return _dev_close(dev, 0);
}
/* Close the device now, even if it is still referenced. */
int dev_close_immediate(struct device *dev)
{
	return _dev_close(dev, 1);
}
/*
 * Walk the global open-devices list and close every device that is
 * no longer referenced (safe iteration: entries are unlinked as we go).
 */
void dev_close_all(void)
	struct dm_list *doh, *doht;

	dm_list_iterate_safe(doh, doht, &_open_devices) {
		dev = dm_list_struct_base(doh, struct device, open_list);
		if (dev->open_count < 1)
/*
 * Read 'len' bytes at byte 'offset' from an already-open device into
 * 'buffer', going through the alignment-aware I/O path.
 * NOTE(review): the where.dev / where.size assignments were lost in
 * extraction.
 */
int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer)
	struct device_area where;

	if (!dev->open_count)

	where.start = offset;

	return _aligned_io(&where, buffer, 0);
/*
 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted
 * by (offset,len) and (offset2,len2).  Thus, the total size of
 * 'buf' should be len+len2.
 */
int dev_read_circular(struct device *dev, uint64_t offset, size_t len,
		      uint64_t offset2, size_t len2, void *buf)
	if (!dev_read(dev, offset, len, buf)) {
		log_error("Read from %s failed", dev_name(dev));

	/*
	 * The second region is optional, and allows for
	 * a circular buffer on the device.
	 */

	/* The second chunk lands immediately after the first in 'buf'. */
	if (!dev_read(dev, offset2, len2, buf + len)) {
		log_error("Circular read from %s failed",
/* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after.
 * But fails if concurrent processes writing
 */

/* FIXME pre-extend the file */
/*
 * Append 'len' bytes at the cached end-of-data offset (dev->end) and
 * advance that offset.
 */
int dev_append(struct device *dev, size_t len, void *buffer)
	if (!dev->open_count)

	r = dev_write(dev, dev->end, len, buffer);

	/* Advance the cached end-of-data offset past the new bytes. */
	dev->end += (uint64_t) len;

#ifndef O_DIRECT_SUPPORT
	/* NOTE(review): the guarded statement (presumably a dev_flush()
	 * when O_DIRECT is unavailable) was lost in extraction. */
/*
 * Write 'len' bytes at byte 'offset' to an already-open device,
 * marking the device as written-to (DEV_ACCESSED_W).
 * NOTE(review): the where.dev / where.size assignments were lost in
 * extraction.
 */
int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer)
	struct device_area where;

	if (!dev->open_count)

	where.start = offset;

	dev->flags |= DEV_ACCESSED_W;

	return _aligned_io(&where, buffer, 1);
/*
 * Fill 'len' bytes at byte 'offset' on the device with 'value',
 * writing through a 4 KiB stack buffer.
 * NOTE(review): the function tail (loop advancing offset/len and the
 * return) runs past the end of this extract.
 */
int dev_set(struct device *dev, uint64_t offset, size_t len, int value)
	char buffer[4096] __attribute((aligned(8)));

	/* Log in bytes when unaligned, otherwise in sectors. */
	if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE))
		log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t,
			  dev_name(dev), offset, len);

	log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t
		  " sectors", dev_name(dev), offset >> SECTOR_SHIFT,
		  len >> SECTOR_SHIFT);

	memset(buffer, value, sizeof(buffer));

	/* Write at most one buffer-full per iteration. */
	s = len > sizeof(buffer) ? sizeof(buffer) : len;
	if (!dev_write(dev, offset, s, buffer))

	dev->flags |= DEV_ACCESSED_W;