MOXA linux-2.6.x / linux-2.6.9-uc0 from sdlinux-moxaart.tgz
[linux-2.6.9-moxart.git] / fs / nfs / direct.c
blob21c3c4d396becd1c575bab082136813660d719f3
1 /*
2 * linux/fs/nfs/direct.c
4 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
6 * High-performance uncached I/O for the Linux NFS client
8 * There are important applications whose performance or correctness
9 * depends on uncached access to file data. Database clusters
10 * (multiple copies of the same instance running on separate hosts)
11 * implement their own cache coherency protocol that subsumes file
12 * system cache protocols. Applications that process datasets
13 * considerably larger than the client's memory do not always benefit
14 * from a local cache. A streaming video server, for instance, has no
15 * need to cache the contents of a file.
17 * When an application requests uncached I/O, all read and write requests
18 * are made directly to the server; data stored or fetched via these
19 * requests is not cached in the Linux page cache. The client does not
20 * correct unaligned requests from applications. All requested bytes are
21 * held on permanent storage before a direct write system call returns to
22 * an application.
24 * Solaris implements an uncached I/O facility called directio() that
25 * is used for backups and sequential I/O to very large files. Solaris
26 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
27 * an undocumented mount option.
29 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
30 * help from Andrew Morton.
32 * 18 Dec 2001 Initial implementation for 2.4 --cel
33 * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
34 * 08 Jun 2003 Port to 2.5 APIs --cel
35 * 31 Mar 2004 Handle direct I/O without VFS support --cel
39 #include <linux/config.h>
40 #include <linux/errno.h>
41 #include <linux/sched.h>
42 #include <linux/kernel.h>
43 #include <linux/smp_lock.h>
44 #include <linux/file.h>
45 #include <linux/pagemap.h>
47 #include <linux/nfs_fs.h>
48 #include <linux/nfs_page.h>
49 #include <linux/sunrpc/clnt.h>
51 #include <asm/system.h>
52 #include <asm/uaccess.h>
/* Debug facility tag for this file; presumably consumed by dprintk() -- confirm against the sunrpc debug headers. */
#define NFSDBG_FACILITY NFSDBG_VFS
/* Number of verifier bytes copied and compared with memcpy()/memcmp() in nfs_direct_write_seg(). */
#define VERF_SIZE (2 * sizeof(__u32))
/* Largest segment, in bytes, that nfs_get_user_pages() accepts: 4096 pages' worth. */
#define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT)
59 /**
60 * nfs_get_user_pages - find and set up pages underlying user's buffer
61 * rw: direction (read or write)
62 * user_addr: starting address of this segment of user's buffer
63 * count: size of this segment
64 * @pages: returned array of page struct pointers underlying user's buffer
66 static inline int
67 nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
68 struct page ***pages)
70 int result = -ENOMEM;
71 unsigned long page_count;
72 size_t array_size;
74 /* set an arbitrary limit to prevent arithmetic overflow */
75 if (size > MAX_DIRECTIO_SIZE)
76 return -EFBIG;
78 page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
79 page_count -= user_addr >> PAGE_SHIFT;
81 array_size = (page_count * sizeof(struct page *));
82 *pages = kmalloc(array_size, GFP_KERNEL);
83 if (*pages) {
84 down_read(&current->mm->mmap_sem);
85 result = get_user_pages(current, current->mm, user_addr,
86 page_count, (rw == READ), 0,
87 *pages, NULL);
88 up_read(&current->mm->mmap_sem);
90 return result;
/**
 * nfs_free_user_pages - tear down page struct array
 * @pages: array of page struct pointers underlying target buffer
 * @npages: number of entries in @pages that hold a page reference
 * @do_dirty: if non-zero, mark each page dirty before releasing it
 *
 * Drops one reference on each of the first @npages pages, in order,
 * then frees the array itself with kfree().
 */
static void
nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int idx = 0;

	while (idx < npages) {
		struct page *page = pages[idx++];

		if (do_dirty)
			set_page_dirty_lock(page);
		page_cache_release(page);
	}

	kfree(pages);
}
110 * nfs_direct_read_seg - Read in one iov segment. Generate separate
111 * read RPCs for each "rsize" bytes.
112 * @inode: target inode
113 * @ctx: target file open context
114 * user_addr: starting address of this segment of user's buffer
115 * count: size of this segment
116 * file_offset: offset in file to begin the operation
117 * @pages: array of addresses of page structs defining user's buffer
118 * nr_pages: size of pages array
120 static int
121 nfs_direct_read_seg(struct inode *inode, struct nfs_open_context *ctx,
122 unsigned long user_addr, size_t count, loff_t file_offset,
123 struct page **pages, int nr_pages)
125 const unsigned int rsize = NFS_SERVER(inode)->rsize;
126 int tot_bytes = 0;
127 int curpage = 0;
128 struct nfs_read_data rdata = {
129 .inode = inode,
130 .cred = ctx->cred,
131 .args = {
132 .fh = NFS_FH(inode),
133 .context = ctx,
135 .res = {
136 .fattr = &rdata.fattr,
140 rdata.args.pgbase = user_addr & ~PAGE_MASK;
141 rdata.args.offset = file_offset;
142 do {
143 int result;
145 rdata.args.count = count;
146 if (rdata.args.count > rsize)
147 rdata.args.count = rsize;
148 rdata.args.pages = &pages[curpage];
150 dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
151 rdata.args.count, (long long) rdata.args.offset,
152 user_addr + tot_bytes, rdata.args.pgbase, curpage);
154 lock_kernel();
155 result = NFS_PROTO(inode)->read(&rdata);
156 unlock_kernel();
158 if (result <= 0) {
159 if (tot_bytes > 0)
160 break;
161 if (result == -EISDIR)
162 result = -EINVAL;
163 return result;
166 tot_bytes += result;
167 if (rdata.res.eof)
168 break;
170 rdata.args.offset += result;
171 rdata.args.pgbase += result;
172 curpage += rdata.args.pgbase >> PAGE_SHIFT;
173 rdata.args.pgbase &= ~PAGE_MASK;
174 count -= result;
175 } while (count != 0);
177 /* XXX: should we zero the rest of the user's buffer if we
178 * hit eof? */
180 return tot_bytes;
184 * nfs_direct_read - For each iov segment, map the user's buffer
185 * then generate read RPCs.
186 * @inode: target inode
187 * @ctx: target file open context
188 * @iov: array of vectors that define I/O buffer
189 * file_offset: offset in file to begin the operation
190 * nr_segs: size of iovec array
192 * generic_file_direct_IO has already pushed out any non-direct
193 * writes so that this read will see them when we read from the
194 * server.
196 static ssize_t
197 nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx,
198 const struct iovec *iov, loff_t file_offset,
199 unsigned long nr_segs)
201 ssize_t tot_bytes = 0;
202 unsigned long seg = 0;
204 while ((seg < nr_segs) && (tot_bytes >= 0)) {
205 ssize_t result;
206 int page_count;
207 struct page **pages;
208 const struct iovec *vec = &iov[seg++];
209 unsigned long user_addr = (unsigned long) vec->iov_base;
210 size_t size = vec->iov_len;
212 page_count = nfs_get_user_pages(READ, user_addr, size, &pages);
213 if (page_count < 0) {
214 nfs_free_user_pages(pages, 0, 0);
215 if (tot_bytes > 0)
216 break;
217 return page_count;
220 result = nfs_direct_read_seg(inode, ctx, user_addr, size,
221 file_offset, pages, page_count);
223 nfs_free_user_pages(pages, page_count, 1);
225 if (result <= 0) {
226 if (tot_bytes > 0)
227 break;
228 return result;
230 tot_bytes += result;
231 file_offset += result;
232 if (result < size)
233 break;
236 return tot_bytes;
240 * nfs_direct_write_seg - Write out one iov segment. Generate separate
241 * write RPCs for each "wsize" bytes, then commit.
242 * @inode: target inode
243 * @ctx: target file open context
244 * user_addr: starting address of this segment of user's buffer
245 * count: size of this segment
246 * file_offset: offset in file to begin the operation
247 * @pages: array of addresses of page structs defining user's buffer
248 * nr_pages: size of pages array
250 static int
251 nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx,
252 unsigned long user_addr, size_t count, loff_t file_offset,
253 struct page **pages, int nr_pages)
255 const unsigned int wsize = NFS_SERVER(inode)->wsize;
256 size_t request;
257 int curpage, need_commit, result, tot_bytes;
258 struct nfs_writeverf first_verf;
259 struct nfs_write_data wdata = {
260 .inode = inode,
261 .cred = ctx->cred,
262 .args = {
263 .fh = NFS_FH(inode),
264 .context = ctx,
266 .res = {
267 .fattr = &wdata.fattr,
268 .verf = &wdata.verf,
272 wdata.args.stable = NFS_UNSTABLE;
273 if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
274 wdata.args.stable = NFS_FILE_SYNC;
276 nfs_begin_data_update(inode);
277 retry:
278 need_commit = 0;
279 tot_bytes = 0;
280 curpage = 0;
281 request = count;
282 wdata.args.pgbase = user_addr & ~PAGE_MASK;
283 wdata.args.offset = file_offset;
284 do {
285 wdata.args.count = request;
286 if (wdata.args.count > wsize)
287 wdata.args.count = wsize;
288 wdata.args.pages = &pages[curpage];
290 dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
291 wdata.args.count, (long long) wdata.args.offset,
292 user_addr + tot_bytes, wdata.args.pgbase, curpage);
294 lock_kernel();
295 result = NFS_PROTO(inode)->write(&wdata);
296 unlock_kernel();
298 if (result <= 0) {
299 if (tot_bytes > 0)
300 break;
301 goto out;
304 if (tot_bytes == 0)
305 memcpy(&first_verf.verifier, &wdata.verf.verifier,
306 VERF_SIZE);
307 if (wdata.verf.committed != NFS_FILE_SYNC) {
308 need_commit = 1;
309 if (memcmp(&first_verf.verifier,
310 &wdata.verf.verifier, VERF_SIZE))
311 goto sync_retry;
314 tot_bytes += result;
315 wdata.args.offset += result;
316 wdata.args.pgbase += result;
317 curpage += wdata.args.pgbase >> PAGE_SHIFT;
318 wdata.args.pgbase &= ~PAGE_MASK;
319 request -= result;
320 } while (request != 0);
323 * Commit data written so far, even in the event of an error
325 if (need_commit) {
326 wdata.args.count = tot_bytes;
327 wdata.args.offset = file_offset;
329 lock_kernel();
330 result = NFS_PROTO(inode)->commit(&wdata);
331 unlock_kernel();
333 if (result < 0 || memcmp(&first_verf.verifier,
334 &wdata.verf.verifier,
335 VERF_SIZE) != 0)
336 goto sync_retry;
338 result = tot_bytes;
340 out:
341 nfs_end_data_update_defer(inode);
343 return result;
345 sync_retry:
346 wdata.args.stable = NFS_FILE_SYNC;
347 goto retry;
351 * nfs_direct_write - For each iov segment, map the user's buffer
352 * then generate write and commit RPCs.
353 * @inode: target inode
354 * @ctx: target file open context
355 * @iov: array of vectors that define I/O buffer
356 * file_offset: offset in file to begin the operation
357 * nr_segs: size of iovec array
359 * Upon return, generic_file_direct_IO invalidates any cached pages
360 * that non-direct readers might access, so they will pick up these
361 * writes immediately.
363 static int nfs_direct_write(struct inode *inode, struct nfs_open_context *ctx,
364 const struct iovec *iov, loff_t file_offset,
365 unsigned long nr_segs)
367 ssize_t tot_bytes = 0;
368 unsigned long seg = 0;
370 while ((seg < nr_segs) && (tot_bytes >= 0)) {
371 ssize_t result;
372 int page_count;
373 struct page **pages;
374 const struct iovec *vec = &iov[seg++];
375 unsigned long user_addr = (unsigned long) vec->iov_base;
376 size_t size = vec->iov_len;
378 page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages);
379 if (page_count < 0) {
380 nfs_free_user_pages(pages, 0, 0);
381 if (tot_bytes > 0)
382 break;
383 return page_count;
386 result = nfs_direct_write_seg(inode, ctx, user_addr, size,
387 file_offset, pages, page_count);
388 nfs_free_user_pages(pages, page_count, 0);
390 if (result <= 0) {
391 if (tot_bytes > 0)
392 break;
393 return result;
395 tot_bytes += result;
396 file_offset += result;
397 if (result < size)
398 break;
400 return tot_bytes;
404 * nfs_direct_IO - NFS address space operation for direct I/O
405 * rw: direction (read or write)
406 * @iocb: target I/O control block
407 * @iov: array of vectors that define I/O buffer
408 * file_offset: offset in file to begin the operation
409 * nr_segs: size of iovec array
412 ssize_t
413 nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
414 loff_t file_offset, unsigned long nr_segs)
416 ssize_t result = -EINVAL;
417 struct file *file = iocb->ki_filp;
418 struct nfs_open_context *ctx;
419 struct dentry *dentry = file->f_dentry;
420 struct inode *inode = dentry->d_inode;
423 * No support for async yet
425 if (!is_sync_kiocb(iocb))
426 return result;
428 ctx = (struct nfs_open_context *)file->private_data;
429 switch (rw) {
430 case READ:
431 dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
432 dentry->d_name.name, file_offset, nr_segs);
434 result = nfs_direct_read(inode, ctx, iov,
435 file_offset, nr_segs);
436 break;
437 case WRITE:
438 dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
439 dentry->d_name.name, file_offset, nr_segs);
441 result = nfs_direct_write(inode, ctx, iov,
442 file_offset, nr_segs);
443 break;
444 default:
445 break;
447 return result;
451 * nfs_file_direct_read - file direct read operation for NFS files
452 * @iocb: target I/O control block
453 * @buf: user's buffer into which to read data
454 * count: number of bytes to read
455 * pos: byte offset in file where reading starts
457 * We use this function for direct reads instead of calling
458 * generic_file_aio_read() in order to avoid gfar's check to see if
459 * the request starts before the end of the file. For that check
460 * to work, we must generate a GETATTR before each direct read, and
461 * even then there is a window between the GETATTR and the subsequent
462 * READ where the file size could change. So our preference is simply
463 * to do all reads the application wants, and the server will take
464 * care of managing the end of file boundary.
466 * This function also eliminates unnecessarily updating the file's
467 * atime locally, as the NFS server sets the file's atime, and this
468 * client must read the updated atime from the server back into its
469 * cache.
471 ssize_t
472 nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
474 ssize_t retval = -EINVAL;
475 loff_t *ppos = &iocb->ki_pos;
476 struct file *file = iocb->ki_filp;
477 struct nfs_open_context *ctx =
478 (struct nfs_open_context *) file->private_data;
479 struct dentry *dentry = file->f_dentry;
480 struct address_space *mapping = file->f_mapping;
481 struct inode *inode = mapping->host;
482 struct iovec iov = {
483 .iov_base = buf,
484 .iov_len = count,
487 dprintk("nfs: direct read(%s/%s, %lu@%lu)\n",
488 dentry->d_parent->d_name.name, dentry->d_name.name,
489 (unsigned long) count, (unsigned long) pos);
491 if (!is_sync_kiocb(iocb))
492 goto out;
493 if (count < 0)
494 goto out;
495 retval = -EFAULT;
496 if (!access_ok(VERIFY_WRITE, iov.iov_base, iov.iov_len))
497 goto out;
498 retval = 0;
499 if (!count)
500 goto out;
502 if (mapping->nrpages) {
503 retval = filemap_fdatawrite(mapping);
504 if (retval == 0)
505 retval = filemap_fdatawait(mapping);
506 if (retval)
507 goto out;
510 retval = nfs_direct_read(inode, ctx, &iov, pos, 1);
511 if (retval > 0)
512 *ppos = pos + retval;
514 out:
515 return retval;
519 * nfs_file_direct_write - file direct write operation for NFS files
520 * @iocb: target I/O control block
521 * @buf: user's buffer from which to write data
522 * count: number of bytes to write
523 * pos: byte offset in file where writing starts
525 * We use this function for direct writes instead of calling
526 * generic_file_aio_write() in order to avoid taking the inode
527 * semaphore and updating the i_size. The NFS server will set
528 * the new i_size and this client must read the updated size
529 * back into its cache. We let the server do generic write
530 * parameter checking and report problems.
532 * We also avoid an unnecessary invocation of generic_osync_inode(),
533 * as it is fairly meaningless to sync the metadata of an NFS file.
535 * We eliminate local atime updates, see direct read above.
537 * We avoid unnecessary page cache invalidations for normal cached
538 * readers of this file.
540 * Note that O_APPEND is not supported for NFS direct writes, as there
541 * is no atomic O_APPEND write facility in the NFS protocol.
543 ssize_t
544 nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
546 ssize_t retval = -EINVAL;
547 loff_t *ppos = &iocb->ki_pos;
548 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
549 struct file *file = iocb->ki_filp;
550 struct nfs_open_context *ctx =
551 (struct nfs_open_context *) file->private_data;
552 struct dentry *dentry = file->f_dentry;
553 struct address_space *mapping = file->f_mapping;
554 struct inode *inode = mapping->host;
555 struct iovec iov = {
556 .iov_base = (char __user *)buf,
557 .iov_len = count,
560 dfprintk(VFS, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n",
561 dentry->d_parent->d_name.name, dentry->d_name.name,
562 inode->i_ino, (unsigned long) count, (unsigned long) pos);
564 if (!is_sync_kiocb(iocb))
565 goto out;
566 if (count < 0)
567 goto out;
568 if (pos < 0)
569 goto out;
570 retval = -EFAULT;
571 if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len))
572 goto out;
573 if (file->f_error) {
574 retval = file->f_error;
575 file->f_error = 0;
576 goto out;
578 retval = -EFBIG;
579 if (limit != RLIM_INFINITY) {
580 if (pos >= limit) {
581 send_sig(SIGXFSZ, current, 0);
582 goto out;
584 if (count > limit - (unsigned long) pos)
585 count = limit - (unsigned long) pos;
587 retval = 0;
588 if (!count)
589 goto out;
591 if (mapping->nrpages) {
592 retval = filemap_fdatawrite(mapping);
593 if (retval == 0)
594 retval = filemap_fdatawait(mapping);
595 if (retval)
596 goto out;
599 retval = nfs_direct_write(inode, ctx, &iov, pos, 1);
600 if (mapping->nrpages)
601 invalidate_inode_pages2(mapping);
602 if (retval > 0)
603 *ppos = pos + retval;
605 out:
606 return retval;