1 /* $NetBSD: rcache.c,v 1.21 2003/09/13 10:59:50 simonb Exp $ */
4 * Copyright (c) 1999 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Martin J. Laubach <mjl@emsi.priv.at> and
9 * Manuel Bouyer <Manuel.Bouyer@lip6.fr>.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
33 #include <sys/cdefs.h>
35 __RCSID("$NetBSD: rcache.c,v 1.21 2003/09/13 10:59:50 simonb Exp $");
38 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/sysctl.h>
43 #include <ufs/ufs/dinode.h>
54 /*-----------------------------------------------------------------------*/
55 #define MAXCACHEBUFS 512 /* max 512 buffers */
56 #define MAXMEMPART 6 /* divisor: cache may use at most 1/6 (about 17%) of the user mem */
58 /*-----------------------------------------------------------------------*/
60 volatile size_t cd_count
;
62 volatile daddr_t blkstart
;
63 volatile daddr_t blkend
; /* start + nblksread */
64 volatile daddr_t blocksRead
;
70 #define cd_blkstart desc.blkstart
71 #define cd_blkend desc.blkend
72 #define cd_blocksRead desc.blocksRead
73 #define cd_time desc.time
74 #define cd_owner desc.owner
77 static int findlru(void);
79 static void *shareBuffer
= NULL
;
80 static union cdesc
*cheader
;
81 static union cdesc
*cdesc
;
89 static int64_t readsize
;
90 static int64_t physreadsize
;
93 #define CSIZE (nblksread << dev_bshift) /* cache buf size */
/* Data buffer paired with descriptor `desc': cdata offset by (index of desc within cdesc[]) * CSIZE */
94 #define CDATA(desc) (cdata + ((desc) - cdesc) * CSIZE)
97 initcache(int cachesize
, int readblksize
)
102 /* Convert read block size in terms of filesystem block size */
103 nblksread
= howmany(readblksize
, ufsib
->ufs_bsize
);
105 /* Then, convert it in terms of device block size */
106 nblksread
<<= ufsib
->ufs_bshift
- dev_bshift
;
108 if (cachesize
== -1) { /* Compute from memory available */
110 int mib
[2] = { CTL_HW
, HW_USERMEM64
};
112 len
= sizeof(usermem
);
113 if (sysctl(mib
, 2, &usermem
, &len
, NULL
, 0) < 0) {
114 msg("sysctl(hw.usermem) failed: %s\n",
118 cachebufs
= (usermem
/ MAXMEMPART
) / CSIZE
;
119 } else { /* User specified */
120 cachebufs
= cachesize
;
123 if (cachebufs
) { /* Don't allocate if zero --> no caching */
124 if (cachebufs
> MAXCACHEBUFS
)
125 cachebufs
= MAXCACHEBUFS
;
127 sharedSize
= sizeof(union cdesc
) +
128 sizeof(union cdesc
) * cachebufs
+
131 fprintf(stderr
, "Using %d buffers (%d bytes)\n", cachebufs
,
134 shareBuffer
= mmap(NULL
, sharedSize
, PROT_READ
| PROT_WRITE
,
135 MAP_ANON
| MAP_SHARED
, -1, 0);
136 if (shareBuffer
== MAP_FAILED
) {
137 msg("can't mmap shared memory for buffer: %s\n",
141 cheader
= shareBuffer
;
142 cdesc
= (union cdesc
*) (((char *) shareBuffer
) +
143 sizeof(union cdesc
));
144 cdata
= ((char *) shareBuffer
) + sizeof(union cdesc
) +
145 sizeof(union cdesc
) * cachebufs
;
147 memset(shareBuffer
, '\0', sharedSize
);
152 * Find the cache buffer descriptor that shows the minimal access time
158 size_t minTime
= cdesc
[0].cd_time
;
161 for (i
= 0; i
< cachebufs
; i
++) {
162 if (cdesc
[i
].cd_time
< minTime
) {
164 minTime
= cdesc
[i
].cd_time
;
172 * Read data directly from disk, with smart error handling.
173 * Try to recover from hard errors by reading in sector sized pieces.
174 * Error recovery is attempted at most BREADEMAX times before seeking
175 * consent from the operator to continue.
178 static int breaderrors
= 0;
182 rawread(daddr_t blkno
, char *buf
, int size
)
188 physreadsize
+= size
;
192 if (lseek(diskfd
, ((off_t
) blkno
<< dev_bshift
), SEEK_SET
) == -1) {
193 msg("rawread: lseek fails\n");
196 if ((cnt
= read(diskfd
, buf
, size
)) == size
)
198 if (blkno
+ (size
>> dev_bshift
) > ufsib
->ufs_dsize
) {
200 * Trying to read the final fragment.
202 * NB - dump only works in TP_BSIZE blocks, hence
203 * rounds `dev_bsize' fragments up to TP_BSIZE pieces.
204 * It should be smarter about not actually trying to
205 * read more than it can get, but for the time being
206 * we punt and scale back the read only when it gets
207 * us into trouble. (mkm 9/25/83)
213 msg("read error from %s: %s: [block %lld]: count=%d\n",
214 disk
, strerror(errno
), (long long)blkno
, size
);
216 msg("short read error from %s: [block %lld]: "
217 "count=%d, got=%d\n",
218 disk
, (long long)blkno
, size
, cnt
);
220 if (++breaderrors
> BREADEMAX
) {
221 msg("More than %d block read errors from %s\n",
223 broadcast("DUMP IS AILING!\n");
224 msg("This is an unrecoverable error.\n");
225 if (!query("Do you want to attempt to continue?")) {
232 * Zero buffer, then try to read each sector of buffer separately.
234 memset(buf
, 0, size
);
235 for (i
= 0; i
< size
; i
+= dev_bsize
, buf
+= dev_bsize
, blkno
++) {
236 if (lseek(diskfd
, ((off_t
)blkno
<< dev_bshift
),
238 msg("rawread: lseek2 fails: %s!\n",
242 if ((cnt
= read(diskfd
, buf
, (int)dev_bsize
)) == dev_bsize
)
245 msg("read error from %s: %s: [sector %lld]: "
246 "count=%ld\n", disk
, strerror(errno
),
247 (long long)blkno
, dev_bsize
);
250 msg("short read error from %s: [sector %lld]: "
251 "count=%ld, got=%d\n",
252 disk
, (long long)blkno
, dev_bsize
, cnt
);
257 bread(daddr_t blkno
, char *buf
, int size
)
259 int osize
= size
, idx
;
260 daddr_t oblkno
= blkno
;
262 daddr_t numBlocks
= howmany(size
, dev_bsize
);
270 rawread(blkno
, buf
, size
);
274 if (flock(diskfd
, LOCK_EX
)) {
275 msg("flock(LOCK_EX) failed: %s\n",
277 rawread(blkno
, buf
, size
);
286 for (i
= 0; i
< cachebufs
; i
++) {
287 union cdesc
*curr
= &cdesc
[(i
+ idx
) % cachebufs
];
290 if (curr
->cd_owner
) {
291 fprintf(stderr
, "Owner is set (%d, me=%d), can"
292 "not happen.\n", curr
->cd_owner
, getpid());
296 if (curr
->cd_blkend
== 0)
299 * If we find a bit of the read in the buffers,
300 * now compute how many blocks we can copy,
301 * copy them out, adjust blkno, buf and size,
304 if (curr
->cd_blkstart
<= blkno
&&
305 blkno
< curr
->cd_blkend
) {
306 /* Number of bytes to be copied */
307 int toCopy
= MIN(size
,
308 (curr
->cd_blkend
- blkno
) << dev_bshift
);
310 if (toCopy
<= 0 || toCopy
> CSIZE
) {
311 fprintf(stderr
, "toCopy %d !\n",
316 ((blkno
- curr
->cd_blkstart
) <<
317 dev_bshift
) < CDATA(curr
) ||
319 ((blkno
- curr
->cd_blkstart
) <<
320 dev_bshift
) > CDATA(curr
) + CSIZE
) {
321 fprintf(stderr
, "%p < %p !!!\n",
322 CDATA(curr
) + ((blkno
-
323 curr
->cd_blkstart
) << dev_bshift
),
326 "cdesc[i].cd_blkstart %lld "
327 "blkno %lld dev_bsize %ld\n",
328 (long long)curr
->cd_blkstart
,
334 memcpy(buf
, CDATA(curr
) +
335 ((blkno
- curr
->cd_blkstart
) <<
341 blkno
+= howmany(toCopy
, dev_bsize
);
342 numBlocks
-= howmany(toCopy
, dev_bsize
);
344 curr
->cd_time
= cheader
->cd_count
++;
347 * If all data of a cache block have been
348 * read, chances are good no more reads
349 * will occur, so expire the cache immediately
352 curr
->cd_blocksRead
+=
353 howmany(toCopy
, dev_bsize
);
354 if (curr
->cd_blocksRead
>= nblksread
)
366 * This does not actually happen if fs blocks are not greater
369 if (numBlocks
> nblksread
|| blkno
>= ufsib
->ufs_dsize
) {
370 rawread(oblkno
, obuf
, osize
);
376 blockBlkNo
= (blkno
/ nblksread
) * nblksread
;
378 rsize
= MIN(nblksread
,
379 ufsib
->ufs_dsize
- blockBlkNo
) << dev_bshift
;
382 if (cdesc
[idx
].cd_owner
)
383 fprintf(stderr
, "Owner is set (%d, me=%d), can"
384 "not happen(2).\n", cdesc
[idx
].cd_owner
,
386 cdesc
[idx
].cd_owner
= getpid();
388 cdesc
[idx
].cd_time
= cheader
->cd_count
++;
389 cdesc
[idx
].cd_blkstart
= blockBlkNo
;
390 cdesc
[idx
].cd_blkend
= 0;
391 cdesc
[idx
].cd_blocksRead
= 0;
393 if (lseek(diskfd
, ((off_t
) blockBlkNo
<< dev_bshift
),
395 msg("readBlocks: lseek fails: %s\n",
400 CDATA(&cdesc
[idx
]), rsize
);
402 msg("readBlocks: read fails: %s\n",
407 /* On errors, panic, punt, try to read without
408 * cache and let raw read routine do the rest.
412 rawread(oblkno
, obuf
, osize
);
414 if (cdesc
[idx
].cd_owner
!= getpid())
415 fprintf(stderr
, "Owner changed from "
416 "%d to %d, can't happen\n",
417 getpid(), cdesc
[idx
].cd_owner
);
418 cdesc
[idx
].cd_owner
= 0;
423 /* On short read, just note the fact and go on */
424 cdesc
[idx
].cd_blkend
= blockBlkNo
+ rsize
/ dev_bsize
;
428 physreadsize
+= rsize
;
431 if (cdesc
[idx
].cd_owner
!= getpid())
432 fprintf(stderr
, "Owner changed from "
433 "%d to %d, can't happen\n",
434 getpid(), cdesc
[idx
].cd_owner
);
435 cdesc
[idx
].cd_owner
= 0;
438 * We swapped some of the data in, let the loop fetch
444 if (flock(diskfd
, LOCK_UN
))
445 msg("flock(LOCK_UN) failed: %s\n",
450 printcachestats(void)
454 fprintf(stderr
, "Pid %d: %d reads (%u bytes) "
455 "%d physical reads (%u bytes) %d%% hits, %d%% overhead\n",
456 getpid(), nreads
, (u_int
) readsize
, nphysread
,
457 (u_int
) physreadsize
, (nreads
- nphysread
) * 100 / nreads
,
458 (int) (((physreadsize
- readsize
) * 100) / readsize
));