On Tue, Nov 06, 2007 at 02:33:53AM -0800, akpm@linux-foundation.org wrote:
[mmotm.git] / fs / buffer.c
blob4f326f83c0e207ec5c5f84ee575534d4d4ca8ed1
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
49 void put_bh(struct buffer_head *bh)
51 WARN_ON(atomic_read(&bh->b_count) <= 0);
52 smp_mb__before_atomic_dec();
53 atomic_dec(&bh->b_count);
55 EXPORT_SYMBOL(put_bh);
57 inline void
58 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
60 bh->b_end_io = handler;
61 bh->b_private = private;
63 EXPORT_SYMBOL(init_buffer);
65 static int sync_buffer(void *word)
67 struct block_device *bd;
68 struct buffer_head *bh
69 = container_of(word, struct buffer_head, b_state);
71 smp_mb();
72 bd = bh->b_bdev;
73 if (bd)
74 blk_run_address_space(bd->bd_inode->i_mapping);
75 io_schedule();
76 return 0;
79 void __lock_buffer(struct buffer_head *bh)
81 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
82 TASK_UNINTERRUPTIBLE);
84 EXPORT_SYMBOL(__lock_buffer);
86 void unlock_buffer(struct buffer_head *bh)
88 clear_bit_unlock(BH_Lock, &bh->b_state);
89 smp_mb__after_clear_bit();
90 wake_up_bit(&bh->b_state, BH_Lock);
92 EXPORT_SYMBOL(unlock_buffer);
95 * Block until a buffer comes unlocked. This doesn't stop it
96 * from becoming locked again - you have to lock it yourself
97 * if you want to preserve its state.
99 void __wait_on_buffer(struct buffer_head * bh)
101 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
103 EXPORT_SYMBOL(__wait_on_buffer);
/*
 * Detach the buffer ring from @page: clear PagePrivate, zero the page's
 * private word and drop one page reference.
 */
static void
__clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_cache_release(page);
}
114 static int quiet_error(struct buffer_head *bh)
116 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
117 return 0;
118 return 1;
122 static void buffer_io_error(struct buffer_head *bh)
124 char b[BDEVNAME_SIZE];
125 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
126 bdevname(bh->b_bdev, b),
127 (unsigned long long)bh->b_blocknr);
/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 *
 * @uptodate non-zero marks the buffer uptodate; zero clears the flag.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}
/*
 * Default synchronous end-of-IO handler.  Just mark it up-to-date and
 * unlock the buffer.  This is what ll_rw_block uses too.
 *
 * Also drops one reference on @bh once it has been unlocked.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);
160 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
162 char b[BDEVNAME_SIZE];
164 if (uptodate) {
165 set_buffer_uptodate(bh);
166 } else {
167 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
168 buffer_io_error(bh);
169 printk(KERN_WARNING "lost page write due to "
170 "I/O error on %s\n",
171 bdevname(bh->b_bdev, b));
173 set_buffer_write_io_error(bh);
174 clear_buffer_uptodate(bh);
176 unlock_buffer(bh);
177 put_bh(bh);
179 EXPORT_SYMBOL(end_buffer_write_sync);
182 * Various filesystems appear to want __find_get_block to be non-blocking.
183 * But it's the page lock which protects the buffers. To get around this,
184 * we get exclusion from try_to_free_buffers with the blockdev mapping's
185 * private_lock.
187 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
188 * may be quite high. This code could TryLock the page, and if that
189 * succeeds, there is no need to take private_lock. (But if
190 * private_lock is contended then so is mapping->tree_lock).
192 static struct buffer_head *
193 __find_get_block_slow(struct block_device *bdev, sector_t block)
195 struct inode *bd_inode = bdev->bd_inode;
196 struct address_space *bd_mapping = bd_inode->i_mapping;
197 struct buffer_head *ret = NULL;
198 pgoff_t index;
199 struct buffer_head *bh;
200 struct buffer_head *head;
201 struct page *page;
202 int all_mapped = 1;
204 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
205 page = find_get_page(bd_mapping, index);
206 if (!page)
207 goto out;
209 spin_lock(&bd_mapping->private_lock);
210 if (!page_has_buffers(page))
211 goto out_unlock;
212 head = page_buffers(page);
213 bh = head;
214 do {
215 if (!buffer_mapped(bh))
216 all_mapped = 0;
217 else if (bh->b_blocknr == block) {
218 ret = bh;
219 get_bh(bh);
220 goto out_unlock;
222 bh = bh->b_this_page;
223 } while (bh != head);
225 /* we might be here because some of the buffers on this page are
226 * not mapped. This is due to various races between
227 * file io on the block device and getblk. It gets dealt with
228 * elsewhere, don't buffer_error if we had some unmapped buffers
230 if (all_mapped) {
231 printk("__find_get_block_slow() failed. "
232 "block=%llu, b_blocknr=%llu\n",
233 (unsigned long long)block,
234 (unsigned long long)bh->b_blocknr);
235 printk("b_state=0x%08lx, b_size=%zu\n",
236 bh->b_state, bh->b_size);
237 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
239 out_unlock:
240 spin_unlock(&bd_mapping->private_lock);
241 page_cache_release(page);
242 out:
243 return ret;
246 /* If invalidate_buffers() will trash dirty buffers, it means some kind
247 of fs corruption is going on. Trashing dirty data always imply losing
248 information that was supposed to be just stored on the physical layer
249 by the user.
251 Thus invalidate_buffers in general usage is not allwowed to trash
252 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
253 be preserved. These buffers are simply skipped.
255 We also skip buffers which are still in use. For example this can
256 happen if a userspace program is reading the block device.
258 NOTE: In the case where the user removed a removable-media-disk even if
259 there's still dirty data not synced on disk (due a bug in the device driver
260 or due an error of the user), by not destroying the dirty buffers we could
261 generate corruption also on the next media inserted, thus a parameter is
262 necessary to handle this case in the most safe way possible (trying
263 to not corrupt also the new disk inserted with the data belonging to
264 the old now corrupted disk). Also for the ramdisk the natural thing
265 to do in order to release the ramdisk memory is to destroy dirty buffers.
267 These are two special cases. Normal usage imply the device driver
268 to issue a sync on the device (without waiting I/O completion) and
269 then an invalidate_buffers call that doesn't trash dirty buffers.
271 For handling cache coherency with the blkdev pagecache the 'update' case
272 is been introduced. It is needed to re-read from disk any pinned
273 buffer. NOTE: re-reading from disk is destructive so we can do it only
274 when we assume nobody is changing the buffercache under our I/O and when
275 we think the disk contains more recent information than the buffercache.
276 The update == 1 pass marks the buffers we need to update, the update == 2
277 pass does the actual I/O. */
278 void invalidate_bdev(struct block_device *bdev)
280 struct address_space *mapping = bdev->bd_inode->i_mapping;
282 if (mapping->nrpages == 0)
283 return;
285 invalidate_bh_lrus();
286 invalidate_mapping_pages(mapping, 0, -1);
288 EXPORT_SYMBOL(invalidate_bdev);
291 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
293 static void free_more_memory(void)
295 struct zone *zone;
296 int nid;
298 wakeup_flusher_threads(1024);
299 yield();
301 for_each_online_node(nid) {
302 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
303 gfp_zone(GFP_NOFS), NULL,
304 &zone);
305 if (zone)
306 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
307 GFP_NOFS, NULL);
312 * I/O completion handler for block_read_full_page() - pages
313 * which come unlocked at the end of I/O.
315 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
317 unsigned long flags;
318 struct buffer_head *first;
319 struct buffer_head *tmp;
320 struct page *page;
321 int page_uptodate = 1;
323 BUG_ON(!buffer_async_read(bh));
325 page = bh->b_page;
326 if (uptodate) {
327 set_buffer_uptodate(bh);
328 } else {
329 clear_buffer_uptodate(bh);
330 if (!quiet_error(bh))
331 buffer_io_error(bh);
332 SetPageError(page);
336 * Be _very_ careful from here on. Bad things can happen if
337 * two buffer heads end IO at almost the same time and both
338 * decide that the page is now completely done.
340 first = page_buffers(page);
341 local_irq_save(flags);
342 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
343 clear_buffer_async_read(bh);
344 unlock_buffer(bh);
345 tmp = bh;
346 do {
347 if (!buffer_uptodate(tmp))
348 page_uptodate = 0;
349 if (buffer_async_read(tmp)) {
350 BUG_ON(!buffer_locked(tmp));
351 goto still_busy;
353 tmp = tmp->b_this_page;
354 } while (tmp != bh);
355 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
356 local_irq_restore(flags);
359 * If none of the buffers had errors and they are all
360 * uptodate then we can set the page uptodate.
362 if (page_uptodate && !PageError(page))
363 SetPageUptodate(page);
364 unlock_page(page);
365 return;
367 still_busy:
368 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
369 local_irq_restore(flags);
370 return;
374 * Completion handler for block_write_full_page() - pages which are unlocked
375 * during I/O, and which have PageWriteback cleared upon I/O completion.
377 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
379 char b[BDEVNAME_SIZE];
380 unsigned long flags;
381 struct buffer_head *first;
382 struct buffer_head *tmp;
383 struct page *page;
385 BUG_ON(!buffer_async_write(bh));
387 page = bh->b_page;
388 if (uptodate) {
389 set_buffer_uptodate(bh);
390 } else {
391 if (!quiet_error(bh)) {
392 buffer_io_error(bh);
393 printk(KERN_WARNING "lost page write due to "
394 "I/O error on %s\n",
395 bdevname(bh->b_bdev, b));
397 set_bit(AS_EIO, &page->mapping->flags);
398 set_buffer_write_io_error(bh);
399 clear_buffer_uptodate(bh);
400 SetPageError(page);
403 first = page_buffers(page);
404 local_irq_save(flags);
405 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
407 clear_buffer_async_write(bh);
408 unlock_buffer(bh);
409 tmp = bh->b_this_page;
410 while (tmp != bh) {
411 if (buffer_async_write(tmp)) {
412 BUG_ON(!buffer_locked(tmp));
413 goto still_busy;
415 tmp = tmp->b_this_page;
417 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
418 local_irq_restore(flags);
419 end_page_writeback(page);
420 return;
422 still_busy:
423 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
424 local_irq_restore(flags);
425 return;
427 EXPORT_SYMBOL(end_buffer_async_write);
430 * If a page's buffers are under async readin (end_buffer_async_read
431 * completion) then there is a possibility that another thread of
432 * control could lock one of the buffers after it has completed
433 * but while some of the other buffers have not completed. This
434 * locked buffer would confuse end_buffer_async_read() into not unlocking
435 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
436 * that this buffer is not under async I/O.
438 * The page comes unlocked when it has no locked buffer_async buffers
439 * left.
441 * PageLocked prevents anyone starting new async I/O reads any of
442 * the buffers.
444 * PageWriteback is used to prevent simultaneous writeout of the same
445 * page.
447 * PageLocked prevents anyone from starting writeback of a page which is
448 * under read I/O (PageWriteback is only ever set against a locked page).
450 static void mark_buffer_async_read(struct buffer_head *bh)
452 bh->b_end_io = end_buffer_async_read;
453 set_buffer_async_read(bh);
456 static void mark_buffer_async_write_endio(struct buffer_head *bh,
457 bh_end_io_t *handler)
459 bh->b_end_io = handler;
460 set_buffer_async_write(bh);
463 void mark_buffer_async_write(struct buffer_head *bh)
465 mark_buffer_async_write_endio(bh, end_buffer_async_write);
467 EXPORT_SYMBOL(mark_buffer_async_write);
471 * fs/buffer.c contains helper functions for buffer-backed address space's
472 * fsync functions. A common requirement for buffer-based filesystems is
473 * that certain data from the backing blockdev needs to be written out for
474 * a successful fsync(). For example, ext2 indirect blocks need to be
475 * written back and waited upon before fsync() returns.
477 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
478 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
479 * management of a list of dependent buffers at ->i_mapping->private_list.
481 * Locking is a little subtle: try_to_free_buffers() will remove buffers
482 * from their controlling inode's queue when they are being freed. But
483 * try_to_free_buffers() will be operating against the *blockdev* mapping
484 * at the time, not against the S_ISREG file which depends on those buffers.
485 * So the locking for private_list is via the private_lock in the address_space
486 * which backs the buffers. Which is different from the address_space
487 * against which the buffers are listed. So for a particular address_space,
488 * mapping->private_lock does *not* protect mapping->private_list! In fact,
489 * mapping->private_list will always be protected by the backing blockdev's
490 * ->private_lock.
492 * Which introduces a requirement: all buffers on an address_space's
493 * ->private_list must be from the same address_space: the blockdev's.
495 * address_spaces which do not place buffers at ->private_list via these
496 * utility functions are free to use private_lock and private_list for
497 * whatever they want. The only requirement is that list_empty(private_list)
498 * be true at clear_inode() time.
500 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
501 * filesystems should do that. invalidate_inode_buffers() should just go
502 * BUG_ON(!list_empty).
504 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
505 * take an address_space, not an inode. And it should be called
506 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
507 * queued up.
509 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
510 * list if it is already on a list. Because if the buffer is on a list,
511 * it *must* already be on the right one. If not, the filesystem is being
512 * silly. This will save a ton of locking. But first we have to ensure
513 * that buffers are taken *off* the old inode's list when they are freed
514 * (presumably in truncate). That requires careful auditing of all
515 * filesystems (do it inside bforget()). It could also be done by bringing
516 * b_inode back.
520 * The buffer's backing address_space's private_lock must be held
522 static void __remove_assoc_queue(struct buffer_head *bh)
524 list_del_init(&bh->b_assoc_buffers);
525 WARN_ON(!bh->b_assoc_map);
526 if (buffer_write_io_error(bh))
527 set_bit(AS_EIO, &bh->b_assoc_map->flags);
528 bh->b_assoc_map = NULL;
531 int inode_has_buffers(struct inode *inode)
533 return !list_empty(&inode->i_data.private_list);
537 * osync is designed to support O_SYNC io. It waits synchronously for
538 * all already-submitted IO to complete, but does not queue any new
539 * writes to the disk.
541 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
542 * you dirty the buffers, and then use osync_inode_buffers to wait for
543 * completion. Any other dirty buffers which are not yet queued for
544 * write will not be flushed to disk by the osync.
546 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
548 struct buffer_head *bh;
549 struct list_head *p;
550 int err = 0;
552 spin_lock(lock);
553 repeat:
554 list_for_each_prev(p, list) {
555 bh = BH_ENTRY(p);
556 if (buffer_locked(bh)) {
557 get_bh(bh);
558 spin_unlock(lock);
559 wait_on_buffer(bh);
560 if (!buffer_uptodate(bh))
561 err = -EIO;
562 brelse(bh);
563 spin_lock(lock);
564 goto repeat;
567 spin_unlock(lock);
568 return err;
571 static void do_thaw_all(struct work_struct *work)
573 struct super_block *sb;
574 char b[BDEVNAME_SIZE];
576 spin_lock(&sb_lock);
577 restart:
578 list_for_each_entry(sb, &super_blocks, s_list) {
579 sb->s_count++;
580 spin_unlock(&sb_lock);
581 down_read(&sb->s_umount);
582 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
583 printk(KERN_WARNING "Emergency Thaw on %s\n",
584 bdevname(sb->s_bdev, b));
585 up_read(&sb->s_umount);
586 spin_lock(&sb_lock);
587 if (__put_super_and_need_restart(sb))
588 goto restart;
590 spin_unlock(&sb_lock);
591 kfree(work);
592 printk(KERN_WARNING "Emergency Thaw complete\n");
596 * emergency_thaw_all -- forcibly thaw every frozen filesystem
598 * Used for emergency unfreeze of all filesystems via SysRq
600 void emergency_thaw_all(void)
602 struct work_struct *work;
604 work = kmalloc(sizeof(*work), GFP_ATOMIC);
605 if (work) {
606 INIT_WORK(work, do_thaw_all);
607 schedule_work(work);
612 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
613 * @mapping: the mapping which wants those buffers written
615 * Starts I/O against the buffers at mapping->private_list, and waits upon
616 * that I/O.
618 * Basically, this is a convenience function for fsync().
619 * @mapping is a file or directory which needs those buffers to be written for
620 * a successful fsync().
622 int sync_mapping_buffers(struct address_space *mapping)
624 struct address_space *buffer_mapping = mapping->assoc_mapping;
626 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
627 return 0;
629 return fsync_buffers_list(&buffer_mapping->private_lock,
630 &mapping->private_list);
632 EXPORT_SYMBOL(sync_mapping_buffers);
635 * Called when we've recently written block `bblock', and it is known that
636 * `bblock' was for a buffer_boundary() buffer. This means that the block at
637 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
638 * dirty, schedule it for IO. So that indirects merge nicely with their data.
640 void write_boundary_block(struct block_device *bdev,
641 sector_t bblock, unsigned blocksize)
643 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
644 if (bh) {
645 if (buffer_dirty(bh))
646 ll_rw_block(WRITE, 1, &bh);
647 put_bh(bh);
651 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
653 struct address_space *mapping = inode->i_mapping;
654 struct address_space *buffer_mapping = bh->b_page->mapping;
656 mark_buffer_dirty(bh);
657 if (!mapping->assoc_mapping) {
658 mapping->assoc_mapping = buffer_mapping;
659 } else {
660 BUG_ON(mapping->assoc_mapping != buffer_mapping);
662 if (!bh->b_assoc_map) {
663 spin_lock(&buffer_mapping->private_lock);
664 list_move_tail(&bh->b_assoc_buffers,
665 &mapping->private_list);
666 bh->b_assoc_map = mapping;
667 spin_unlock(&buffer_mapping->private_lock);
670 EXPORT_SYMBOL(mark_buffer_dirty_inode);
673 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
674 * dirty.
676 * If warn is true, then emit a warning if the page is not uptodate and has
677 * not been truncated.
679 static void __set_page_dirty(struct page *page,
680 struct address_space *mapping, int warn)
682 spin_lock_irq(&mapping->tree_lock);
683 if (page->mapping) { /* Race with truncate? */
684 WARN_ON_ONCE(warn && !PageUptodate(page));
685 account_page_dirtied(page, mapping);
686 radix_tree_tag_set(&mapping->page_tree,
687 page_index(page), PAGECACHE_TAG_DIRTY);
689 spin_unlock_irq(&mapping->tree_lock);
690 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
694 * Add a page to the dirty page list.
696 * It is a sad fact of life that this function is called from several places
697 * deeply under spinlocking. It may not sleep.
699 * If the page has buffers, the uptodate buffers are set dirty, to preserve
700 * dirty-state coherency between the page and the buffers. It the page does
701 * not have buffers then when they are later attached they will all be set
702 * dirty.
704 * The buffers are dirtied before the page is dirtied. There's a small race
705 * window in which a writepage caller may see the page cleanness but not the
706 * buffer dirtiness. That's fine. If this code were to set the page dirty
707 * before the buffers, a concurrent writepage caller could clear the page dirty
708 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
709 * page on the dirty page list.
711 * We use private_lock to lock against try_to_free_buffers while using the
712 * page's buffer list. Also use this to protect against clean buffers being
713 * added to the page after it was set dirty.
715 * FIXME: may need to call ->reservepage here as well. That's rather up to the
716 * address_space though.
718 int __set_page_dirty_buffers(struct page *page)
720 int newly_dirty;
721 struct address_space *mapping = page_mapping(page);
723 if (unlikely(!mapping))
724 return !TestSetPageDirty(page);
726 spin_lock(&mapping->private_lock);
727 if (page_has_buffers(page)) {
728 struct buffer_head *head = page_buffers(page);
729 struct buffer_head *bh = head;
731 do {
732 set_buffer_dirty(bh);
733 bh = bh->b_this_page;
734 } while (bh != head);
736 newly_dirty = !TestSetPageDirty(page);
737 spin_unlock(&mapping->private_lock);
739 if (newly_dirty)
740 __set_page_dirty(page, mapping, 1);
741 return newly_dirty;
743 EXPORT_SYMBOL(__set_page_dirty_buffers);
746 * Write out and wait upon a list of buffers.
748 * We have conflicting pressures: we want to make sure that all
749 * initially dirty buffers get waited on, but that any subsequently
750 * dirtied buffers don't. After all, we don't want fsync to last
751 * forever if somebody is actively writing to the file.
753 * Do this in two main stages: first we copy dirty buffers to a
754 * temporary inode list, queueing the writes as we go. Then we clean
755 * up, waiting for those writes to complete.
757 * During this second stage, any subsequent updates to the file may end
758 * up refiling the buffer on the original inode's dirty list again, so
759 * there is a chance we will end up with a buffer queued for write but
760 * not yet completed on that list. So, as a final cleanup we go through
761 * the osync code to catch these locked, dirty buffers without requeuing
762 * any newly dirty buffers for write.
764 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
766 struct buffer_head *bh;
767 struct list_head tmp;
768 struct address_space *mapping, *prev_mapping = NULL;
769 int err = 0, err2;
771 INIT_LIST_HEAD(&tmp);
773 spin_lock(lock);
774 while (!list_empty(list)) {
775 bh = BH_ENTRY(list->next);
776 mapping = bh->b_assoc_map;
777 __remove_assoc_queue(bh);
778 /* Avoid race with mark_buffer_dirty_inode() which does
779 * a lockless check and we rely on seeing the dirty bit */
780 smp_mb();
781 if (buffer_dirty(bh) || buffer_locked(bh)) {
782 list_add(&bh->b_assoc_buffers, &tmp);
783 bh->b_assoc_map = mapping;
784 if (buffer_dirty(bh)) {
785 get_bh(bh);
786 spin_unlock(lock);
788 * Ensure any pending I/O completes so that
789 * ll_rw_block() actually writes the current
790 * contents - it is a noop if I/O is still in
791 * flight on potentially older contents.
793 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
796 * Kick off IO for the previous mapping. Note
797 * that we will not run the very last mapping,
798 * wait_on_buffer() will do that for us
799 * through sync_buffer().
801 if (prev_mapping && prev_mapping != mapping)
802 blk_run_address_space(prev_mapping);
803 prev_mapping = mapping;
805 brelse(bh);
806 spin_lock(lock);
811 while (!list_empty(&tmp)) {
812 bh = BH_ENTRY(tmp.prev);
813 get_bh(bh);
814 mapping = bh->b_assoc_map;
815 __remove_assoc_queue(bh);
816 /* Avoid race with mark_buffer_dirty_inode() which does
817 * a lockless check and we rely on seeing the dirty bit */
818 smp_mb();
819 if (buffer_dirty(bh)) {
820 list_add(&bh->b_assoc_buffers,
821 &mapping->private_list);
822 bh->b_assoc_map = mapping;
824 spin_unlock(lock);
825 wait_on_buffer(bh);
826 if (!buffer_uptodate(bh))
827 err = -EIO;
828 brelse(bh);
829 spin_lock(lock);
832 spin_unlock(lock);
833 err2 = osync_buffers_list(lock, list);
834 if (err)
835 return err;
836 else
837 return err2;
841 * Invalidate any and all dirty buffers on a given inode. We are
842 * probably unmounting the fs, but that doesn't mean we have already
843 * done a sync(). Just drop the buffers from the inode list.
845 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
846 * assumes that all the buffers are against the blockdev. Not true
847 * for reiserfs.
849 void invalidate_inode_buffers(struct inode *inode)
851 if (inode_has_buffers(inode)) {
852 struct address_space *mapping = &inode->i_data;
853 struct list_head *list = &mapping->private_list;
854 struct address_space *buffer_mapping = mapping->assoc_mapping;
856 spin_lock(&buffer_mapping->private_lock);
857 while (!list_empty(list))
858 __remove_assoc_queue(BH_ENTRY(list->next));
859 spin_unlock(&buffer_mapping->private_lock);
862 EXPORT_SYMBOL(invalidate_inode_buffers);
865 * Remove any clean buffers from the inode's buffer list. This is called
866 * when we're trying to free the inode itself. Those buffers can pin it.
868 * Returns true if all buffers were removed.
870 int remove_inode_buffers(struct inode *inode)
872 int ret = 1;
874 if (inode_has_buffers(inode)) {
875 struct address_space *mapping = &inode->i_data;
876 struct list_head *list = &mapping->private_list;
877 struct address_space *buffer_mapping = mapping->assoc_mapping;
879 spin_lock(&buffer_mapping->private_lock);
880 while (!list_empty(list)) {
881 struct buffer_head *bh = BH_ENTRY(list->next);
882 if (buffer_dirty(bh)) {
883 ret = 0;
884 break;
886 __remove_assoc_queue(bh);
888 spin_unlock(&buffer_mapping->private_lock);
890 return ret;
894 * Create the appropriate buffers when given a page for data area and
895 * the size of each buffer.. Use the bh->b_this_page linked list to
896 * follow the buffers created. Return NULL if unable to create more
897 * buffers.
899 * The retry flag is used to differentiate async IO (paging, swapping)
900 * which may not fail from ordinary buffer allocations.
902 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
903 int retry)
905 struct buffer_head *bh, *head;
906 long offset;
908 try_again:
909 head = NULL;
910 offset = PAGE_SIZE;
911 while ((offset -= size) >= 0) {
912 bh = alloc_buffer_head(GFP_NOFS);
913 if (!bh)
914 goto no_grow;
916 bh->b_bdev = NULL;
917 bh->b_this_page = head;
918 bh->b_blocknr = -1;
919 head = bh;
921 bh->b_state = 0;
922 atomic_set(&bh->b_count, 0);
923 bh->b_private = NULL;
924 bh->b_size = size;
926 /* Link the buffer to its page */
927 set_bh_page(bh, page, offset);
929 init_buffer(bh, NULL, NULL);
931 return head;
933 * In case anything failed, we just free everything we got.
935 no_grow:
936 if (head) {
937 do {
938 bh = head;
939 head = head->b_this_page;
940 free_buffer_head(bh);
941 } while (head);
945 * Return failure for non-async IO requests. Async IO requests
946 * are not allowed to fail, so we have to wait until buffer heads
947 * become available. But we don't want tasks sleeping with
948 * partially complete buffers, so all were released above.
950 if (!retry)
951 return NULL;
953 /* We're _really_ low on memory. Now we just
954 * wait for old buffer heads to become free due to
955 * finishing IO. Since this is an async request and
956 * the reserve list is empty, we're sure there are
957 * async buffer heads in use.
959 free_more_memory();
960 goto try_again;
962 EXPORT_SYMBOL_GPL(alloc_page_buffers);
964 static inline void
965 link_dev_buffers(struct page *page, struct buffer_head *head)
967 struct buffer_head *bh, *tail;
969 bh = head;
970 do {
971 tail = bh;
972 bh = bh->b_this_page;
973 } while (bh);
974 tail->b_this_page = head;
975 attach_page_buffers(page, head);
979 * Initialise the state of a blockdev page's buffers.
981 static void
982 init_page_buffers(struct page *page, struct block_device *bdev,
983 sector_t block, int size)
985 struct buffer_head *head = page_buffers(page);
986 struct buffer_head *bh = head;
987 int uptodate = PageUptodate(page);
989 do {
990 if (!buffer_mapped(bh)) {
991 init_buffer(bh, NULL, NULL);
992 bh->b_bdev = bdev;
993 bh->b_blocknr = block;
994 if (uptodate)
995 set_buffer_uptodate(bh);
996 set_buffer_mapped(bh);
998 block++;
999 bh = bh->b_this_page;
1000 } while (bh != head);
1004 * Create the page-cache page that contains the requested block.
1006 * This is user purely for blockdev mappings.
1008 static struct page *
1009 grow_dev_page(struct block_device *bdev, sector_t block,
1010 pgoff_t index, int size)
1012 struct inode *inode = bdev->bd_inode;
1013 struct page *page;
1014 struct buffer_head *bh;
1016 page = find_or_create_page(inode->i_mapping, index,
1017 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1018 if (!page)
1019 return NULL;
1021 BUG_ON(!PageLocked(page));
1023 if (page_has_buffers(page)) {
1024 bh = page_buffers(page);
1025 if (bh->b_size == size) {
1026 init_page_buffers(page, bdev, block, size);
1027 return page;
1029 if (!try_to_free_buffers(page))
1030 goto failed;
1034 * Allocate some buffers for this page
1036 bh = alloc_page_buffers(page, size, 0);
1037 if (!bh)
1038 goto failed;
1041 * Link the page to the buffers and initialise them. Take the
1042 * lock to be atomic wrt __find_get_block(), which does not
1043 * run under the page lock.
1045 spin_lock(&inode->i_mapping->private_lock);
1046 link_dev_buffers(page, bh);
1047 init_page_buffers(page, bdev, block, size);
1048 spin_unlock(&inode->i_mapping->private_lock);
1049 return page;
1051 failed:
1052 BUG();
1053 unlock_page(page);
1054 page_cache_release(page);
1055 return NULL;
1059 * Create buffers for the specified block device block's page. If
1060 * that page was dirty, the buffers are set dirty also.
1062 static int
1063 grow_buffers(struct block_device *bdev, sector_t block, int size)
1065 struct page *page;
1066 pgoff_t index;
1067 int sizebits;
1069 sizebits = -1;
1070 do {
1071 sizebits++;
1072 } while ((size << sizebits) < PAGE_SIZE);
1074 index = block >> sizebits;
1077 * Check for a block which wants to lie outside our maximum possible
1078 * pagecache index. (this comparison is done using sector_t types).
1080 if (unlikely(index != block >> sizebits)) {
1081 char b[BDEVNAME_SIZE];
1083 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1084 "device %s\n",
1085 __func__, (unsigned long long)block,
1086 bdevname(bdev, b));
1087 return -EIO;
1089 block = index << sizebits;
1090 /* Create a page with the proper size buffers.. */
1091 page = grow_dev_page(bdev, block, index, size);
1092 if (!page)
1093 return 0;
1094 unlock_page(page);
1095 page_cache_release(page);
1096 return 1;
1099 static struct buffer_head *
1100 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1102 /* Size must be multiple of hard sectorsize */
1103 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1104 (size < 512 || size > PAGE_SIZE))) {
1105 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1106 size);
1107 printk(KERN_ERR "logical block size: %d\n",
1108 bdev_logical_block_size(bdev));
1110 dump_stack();
1111 return NULL;
1114 #if (BITS_PER_LONG == 32) && defined(CONFIG_LBD)
1115 if ((block >> (PAGE_CACHE_SHIFT - bdev->bd_inode->i_blkbits)) &
1116 0xffffffff00000000ULL) {
1118 * We'll fail because the block is outside the range
1119 * which a 32-bit pagecache index can address
1121 printk(KERN_ERR "getblk(): sector number too large for 32-bit"
1122 "machines\n");
1123 dump_stack();
1124 return NULL;
1126 #endif
1128 for (;;) {
1129 struct buffer_head * bh;
1130 int ret;
1132 bh = __find_get_block(bdev, block, size);
1133 if (bh)
1134 return bh;
1136 ret = grow_buffers(bdev, block, size);
1137 if (ret < 0)
1138 return NULL;
1139 if (ret == 0)
1140 free_more_memory();
1145 * The relationship between dirty buffers and dirty pages:
1147 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1148 * the page is tagged dirty in its radix tree.
1150 * At all times, the dirtiness of the buffers represents the dirtiness of
1151 * subsections of the page. If the page has buffers, the page dirty bit is
1152 * merely a hint about the true dirty state.
1154 * When a page is set dirty in its entirety, all its buffers are marked dirty
1155 * (if the page has buffers).
1157 * When a buffer is marked dirty, its page is dirtied, but the page's other
1158 * buffers are not.
1160 * Also. When blockdev buffers are explicitly read with bread(), they
1161 * individually become uptodate. But their backing page remains not
1162 * uptodate - even if all of its buffers are uptodate. A subsequent
1163 * block_read_full_page() against that page will discover all the uptodate
1164 * buffers, will set the page uptodate and will perform no I/O.
1168 * mark_buffer_dirty - mark a buffer_head as needing writeout
1169 * @bh: the buffer_head to mark dirty
1171 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1172 * backing page dirty, then tag the page as dirty in its address_space's radix
1173 * tree and then attach the address_space's inode to its superblock's dirty
1174 * inode list.
1176 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1177 * mapping->tree_lock and the global inode_lock.
1179 void mark_buffer_dirty(struct buffer_head *bh)
1181 WARN_ON_ONCE(!buffer_uptodate(bh));
1184 * Very *carefully* optimize the it-is-already-dirty case.
1186 * Don't let the final "is it dirty" escape to before we
1187 * perhaps modified the buffer.
1189 if (buffer_dirty(bh)) {
1190 smp_mb();
1191 if (buffer_dirty(bh))
1192 return;
1195 if (!test_set_buffer_dirty(bh)) {
1196 struct page *page = bh->b_page;
1197 if (!TestSetPageDirty(page)) {
1198 struct address_space *mapping = page_mapping(page);
1199 if (mapping)
1200 __set_page_dirty(page, mapping, 0);
1204 EXPORT_SYMBOL(mark_buffer_dirty);
1207 * Decrement a buffer_head's reference count. If all buffers against a page
1208 * have zero reference count, are clean and unlocked, and if the page is clean
1209 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1210 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1211 * a page but it ends up not being freed, and buffers may later be reattached).
1213 void __brelse(struct buffer_head * buf)
1215 if (atomic_read(&buf->b_count)) {
1216 put_bh(buf);
1217 return;
1219 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1221 EXPORT_SYMBOL(__brelse);
1224 * bforget() is like brelse(), except it discards any
1225 * potentially dirty data.
1227 void __bforget(struct buffer_head *bh)
1229 clear_buffer_dirty(bh);
1230 if (bh->b_assoc_map) {
1231 struct address_space *buffer_mapping = bh->b_page->mapping;
1233 spin_lock(&buffer_mapping->private_lock);
1234 list_del_init(&bh->b_assoc_buffers);
1235 bh->b_assoc_map = NULL;
1236 spin_unlock(&buffer_mapping->private_lock);
1238 __brelse(bh);
1240 EXPORT_SYMBOL(__bforget);
1242 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1244 lock_buffer(bh);
1245 if (buffer_uptodate(bh)) {
1246 unlock_buffer(bh);
1247 return bh;
1248 } else {
1249 get_bh(bh);
1250 bh->b_end_io = end_buffer_read_sync;
1251 submit_bh(READ, bh);
1252 wait_on_buffer(bh);
1253 if (buffer_uptodate(bh))
1254 return bh;
1256 brelse(bh);
1257 return NULL;
1261 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1262 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1263 * refcount elevated by one when they're in an LRU. A buffer can only appear
1264 * once in a particular CPU's LRU. A single buffer can be present in multiple
1265 * CPU's LRUs at the same time.
1267 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1268 * sb_find_get_block().
1270 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1271 * a local interrupt disable for that.
/* Number of buffer_head slots in each per-cpu LRU. */
1274 #define BH_LRU_SIZE 8
/* One LRU: a fixed array of buffer_head pointers. */
1276 struct bh_lru {
1277 struct buffer_head *bhs[BH_LRU_SIZE];
/* The per-cpu LRU instances, initially all NULL. */
1280 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
/*
 * bh_lru_lock()/bh_lru_unlock() bracket accesses to this CPU's LRU:
 * local interrupts are disabled with CONFIG_SMP, otherwise only preemption.
 */
1282 #ifdef CONFIG_SMP
1283 #define bh_lru_lock() local_irq_disable()
1284 #define bh_lru_unlock() local_irq_enable()
1285 #else
1286 #define bh_lru_lock() preempt_disable()
1287 #define bh_lru_unlock() preempt_enable()
1288 #endif
/*
 * check_irqs_on - BUG if interrupts are disabled.  Compiles to nothing when
 * the irqs_disabled macro is not defined.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1298 * The LRU management algorithm is dopey-but-simple. Sorry.
1300 static void bh_lru_install(struct buffer_head *bh)
1302 struct buffer_head *evictee = NULL;
1303 struct bh_lru *lru;
1305 check_irqs_on();
1306 bh_lru_lock();
1307 lru = &__get_cpu_var(bh_lrus);
1308 if (lru->bhs[0] != bh) {
1309 struct buffer_head *bhs[BH_LRU_SIZE];
1310 int in;
1311 int out = 0;
1313 get_bh(bh);
1314 bhs[out++] = bh;
1315 for (in = 0; in < BH_LRU_SIZE; in++) {
1316 struct buffer_head *bh2 = lru->bhs[in];
1318 if (bh2 == bh) {
1319 __brelse(bh2);
1320 } else {
1321 if (out >= BH_LRU_SIZE) {
1322 BUG_ON(evictee != NULL);
1323 evictee = bh2;
1324 } else {
1325 bhs[out++] = bh2;
1329 while (out < BH_LRU_SIZE)
1330 bhs[out++] = NULL;
1331 memcpy(lru->bhs, bhs, sizeof(bhs));
1333 bh_lru_unlock();
1335 if (evictee)
1336 __brelse(evictee);
1340 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1342 static struct buffer_head *
1343 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1345 struct buffer_head *ret = NULL;
1346 struct bh_lru *lru;
1347 unsigned int i;
1349 check_irqs_on();
1350 bh_lru_lock();
1351 lru = &__get_cpu_var(bh_lrus);
1352 for (i = 0; i < BH_LRU_SIZE; i++) {
1353 struct buffer_head *bh = lru->bhs[i];
1355 if (bh && bh->b_bdev == bdev &&
1356 bh->b_blocknr == block && bh->b_size == size) {
1357 if (i) {
1358 while (i) {
1359 lru->bhs[i] = lru->bhs[i - 1];
1360 i--;
1362 lru->bhs[0] = bh;
1364 get_bh(bh);
1365 ret = bh;
1366 break;
1369 bh_lru_unlock();
1370 return ret;
1374 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1375 * it in the LRU and mark it as accessed. If it is not present then return
1376 * NULL
1378 struct buffer_head *
1379 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1381 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1383 if (bh == NULL) {
1384 bh = __find_get_block_slow(bdev, block);
1385 if (bh)
1386 bh_lru_install(bh);
1388 if (bh)
1389 touch_buffer(bh);
1390 return bh;
1392 EXPORT_SYMBOL(__find_get_block);
1395 * __getblk will locate (and, if necessary, create) the buffer_head
1396 * which corresponds to the passed block_device, block and size. The
1397 * returned buffer has its reference count incremented.
1399 * __getblk() cannot fail - it just keeps trying. If you pass it an
1400 * illegal block number, __getblk() will happily return a buffer_head
1401 * which represents the non-existent block. Very weird.
1403 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1404 * attempt is failing. FIXME, perhaps?
1406 struct buffer_head *
1407 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1409 struct buffer_head *bh = __find_get_block(bdev, block, size);
1411 might_sleep();
1412 if (bh == NULL)
1413 bh = __getblk_slow(bdev, block, size);
1414 return bh;
1416 EXPORT_SYMBOL(__getblk);
1419 * Do async read-ahead on a buffer..
1421 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1423 struct buffer_head *bh = __getblk(bdev, block, size);
1424 if (likely(bh)) {
1425 ll_rw_block(READA, 1, &bh);
1426 brelse(bh);
1429 EXPORT_SYMBOL(__breadahead);
1432 * __bread() - reads a specified block and returns the bh
1433 * @bdev: the block_device to read from
1434 * @block: number of block
1435 * @size: size (in bytes) to read
1437 * Reads a specified block, and returns buffer head that contains it.
1438 * It returns NULL if the block was unreadable.
1440 struct buffer_head *
1441 __bread(struct block_device *bdev, sector_t block, unsigned size)
1443 struct buffer_head *bh = __getblk(bdev, block, size);
1445 if (likely(bh) && !buffer_uptodate(bh))
1446 bh = __bread_slow(bh);
1447 return bh;
1449 EXPORT_SYMBOL(__bread);
1452 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1453 * This doesn't race because it runs in each cpu either in irq
1454 * or with preempt disabled.
1456 static void invalidate_bh_lru(void *arg)
1458 struct bh_lru *b = &get_cpu_var(bh_lrus);
1459 int i;
1461 for (i = 0; i < BH_LRU_SIZE; i++) {
1462 brelse(b->bhs[i]);
1463 b->bhs[i] = NULL;
1465 put_cpu_var(bh_lrus);
1468 void invalidate_bh_lrus(void)
1470 on_each_cpu(invalidate_bh_lru, NULL, 1);
1472 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1474 void set_bh_page(struct buffer_head *bh,
1475 struct page *page, unsigned long offset)
1477 bh->b_page = page;
1478 BUG_ON(offset >= PAGE_SIZE);
1479 if (PageHighMem(page))
1481 * This catches illegal uses and preserves the offset:
1483 bh->b_data = (char *)(0 + offset);
1484 else
1485 bh->b_data = page_address(page) + offset;
1487 EXPORT_SYMBOL(set_bh_page);
1490 * Called when truncating a buffer on a page completely.
1492 static void discard_buffer(struct buffer_head * bh)
1494 lock_buffer(bh);
1495 clear_buffer_dirty(bh);
1496 bh->b_bdev = NULL;
1497 clear_buffer_mapped(bh);
1498 clear_buffer_req(bh);
1499 clear_buffer_new(bh);
1500 clear_buffer_delay(bh);
1501 clear_buffer_unwritten(bh);
1502 unlock_buffer(bh);
1506 * block_invalidatepage - invalidate part or all of a buffer-backed page
1508 * @page: the page which is affected
1509 * @offset: the index of the truncation point
1511 * block_invalidatepage() is called when all or part of the page has become
1512 * invalidated by a truncate operation.
1514 * block_invalidatepage() does not have to release all buffers, but it must
1515 * ensure that no dirty buffer is left outside @offset and that no I/O
1516 * is underway against any of the blocks which are outside the truncation
1517 * point. Because the caller is about to free (and possibly reuse) those
1518 * blocks on-disk.
1520 void block_invalidatepage(struct page *page, unsigned long offset)
1522 struct buffer_head *head, *bh, *next;
1523 unsigned int curr_off = 0;
1525 BUG_ON(!PageLocked(page));
1526 if (!page_has_buffers(page))
1527 goto out;
1529 head = page_buffers(page);
1530 bh = head;
1531 do {
1532 unsigned int next_off = curr_off + bh->b_size;
1533 next = bh->b_this_page;
1536 * is this block fully invalidated?
1538 if (offset <= curr_off)
1539 discard_buffer(bh);
1540 curr_off = next_off;
1541 bh = next;
1542 } while (bh != head);
1545 * We release buffers only if the entire page is being invalidated.
1546 * The get_block cached value has been unconditionally invalidated,
1547 * so real IO is not possible anymore.
1549 if (offset == 0)
1550 try_to_release_page(page, 0);
1551 out:
1552 return;
1554 EXPORT_SYMBOL(block_invalidatepage);
1557 * We attach and possibly dirty the buffers atomically wrt
1558 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1559 * is already excluded via the page lock.
1561 void create_empty_buffers(struct page *page,
1562 unsigned long blocksize, unsigned long b_state)
1564 struct buffer_head *bh, *head, *tail;
1566 head = alloc_page_buffers(page, blocksize, 1);
1567 bh = head;
1568 do {
1569 bh->b_state |= b_state;
1570 tail = bh;
1571 bh = bh->b_this_page;
1572 } while (bh);
1573 tail->b_this_page = head;
1575 spin_lock(&page->mapping->private_lock);
1576 if (PageUptodate(page) || PageDirty(page)) {
1577 bh = head;
1578 do {
1579 if (PageDirty(page))
1580 set_buffer_dirty(bh);
1581 if (PageUptodate(page))
1582 set_buffer_uptodate(bh);
1583 bh = bh->b_this_page;
1584 } while (bh != head);
1586 attach_page_buffers(page, head);
1587 spin_unlock(&page->mapping->private_lock);
1589 EXPORT_SYMBOL(create_empty_buffers);
1592 * We are taking a block for data and we don't want any output from any
1593 * buffer-cache aliases starting from return from that function and
1594 * until the moment when something will explicitly mark the buffer
1595 * dirty (hopefully that will not happen until we will free that block ;-)
1596 * We don't even need to mark it not-uptodate - nobody can expect
1597 * anything from a newly allocated buffer anyway. We used to use
1598 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1599 * don't want to mark the alias unmapped, for example - it would confuse
1600 * anyone who might pick it with bread() afterwards...
1602 * Also.. Note that bforget() doesn't lock the buffer. So there can
1603 * be writeout I/O going on against recently-freed buffers. We don't
1604 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1605 * only if we really need to. That happens here.
1607 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1609 struct buffer_head *old_bh;
1611 might_sleep();
1613 old_bh = __find_get_block_slow(bdev, block);
1614 if (old_bh) {
1615 clear_buffer_dirty(old_bh);
1616 wait_on_buffer(old_bh);
1617 clear_buffer_req(old_bh);
1618 __brelse(old_bh);
1621 EXPORT_SYMBOL(unmap_underlying_metadata);
1624 * NOTE! All mapped/uptodate combinations are valid:
1626 * Mapped Uptodate Meaning
1628 * No No "unknown" - must do get_block()
1629 * No Yes "hole" - zero-filled
1630 * Yes No "allocated" - allocated on disk, not read in
1631 * Yes Yes "valid" - allocated and up-to-date in memory.
1633 * "Dirty" is valid only with the last case (mapped+uptodate).
1637 * While block_write_full_page is writing back the dirty buffers under
1638 * the page lock, whoever dirtied the buffers may decide to clean them
1639 * again at any time. We handle that by only looking at the buffer
1640 * state inside lock_buffer().
1642 * If block_write_full_page() is called for regular writeback
1643 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1644 * locked buffer. This only can happen if someone has written the buffer
1645 * directly, with submit_bh(). At the address_space level PageWriteback
1646 * prevents this contention from occurring.
1648 * If block_write_full_page() is called with wbc->sync_mode ==
1649 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1650 * causes the writes to be flagged as synchronous writes, but the
1651 * block device queue will NOT be unplugged, since usually many pages
1652 * will be pushed out before the higher-level caller actually
1653 * waits for the writes to be completed. The various wait functions,
1654 * such as wait_on_writeback_range() will ultimately call sync_page()
1655 * which will ultimately call blk_run_backing_dev(), which will end up
1656 * unplugging the device queue.
/*
 * __block_write_full_page - write out the dirty buffers of a locked page.
 *
 * @inode:     inode the page belongs to (supplies i_blkbits and i_size)
 * @page:      locked page to write
 * @get_block: called with create=1 to map dirty buffers that are unmapped
 *             or marked delay
 * @wbc:       writeback control; sync_mode selects WRITE_SYNC_PLUG vs WRITE
 *             and, with nonblocking, whether buffer locks are only tried
 * @handler:   end_io handler passed to mark_buffer_async_write_endio()
 *
 * Sequence: attach dirty+uptodate buffers if the page has none; map buffers;
 * lock each mapped buffer and mark the dirty ones for async write; set page
 * writeback; submit the marked buffers; unlock the page.  If nothing was
 * submitted, page writeback is ended here.
 *
 * Returns 0, or the get_block() error after the recover path has submitted
 * whatever was already mapped and dirty.
 */
1658 static int __block_write_full_page(struct inode *inode, struct page *page,
1659 get_block_t *get_block, struct writeback_control *wbc,
1660 bh_end_io_t *handler)
1662 int err;
1663 sector_t block;
1664 sector_t last_block;
1665 struct buffer_head *bh, *head;
1666 const unsigned blocksize = 1 << inode->i_blkbits;
1667 int nr_underway = 0;
1668 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1669 WRITE_SYNC_PLUG : WRITE);
1671 BUG_ON(!PageLocked(page));
/* Index of the last block that lies inside i_size. */
1673 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1675 if (!page_has_buffers(page)) {
1676 create_empty_buffers(page, blocksize,
1677 (1 << BH_Dirty)|(1 << BH_Uptodate));
1681 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1682 * here, and the (potentially unmapped) buffers may become dirty at
1683 * any time. If a buffer becomes dirty here after we've inspected it
1684 * then we just miss that fact, and the page stays dirty.
1686 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1687 * handle that here by just cleaning them.
/* File block number of the first buffer on this page. */
1690 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1691 head = page_buffers(page);
1692 bh = head;
1695 * Get all the dirty buffers mapped to disk addresses and
1696 * handle any aliases from the underlying blockdev's mapping.
1698 do {
1699 if (block > last_block) {
1701 * mapped buffers outside i_size will occur, because
1702 * this page can be outside i_size when there is a
1703 * truncate in progress.
1706 * The buffer was zeroed by block_write_full_page()
1708 clear_buffer_dirty(bh);
1709 set_buffer_uptodate(bh);
1710 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1711 buffer_dirty(bh)) {
1712 WARN_ON(bh->b_size != blocksize);
1713 err = get_block(inode, block, bh, 1);
1714 if (err)
1715 goto recover;
1716 clear_buffer_delay(bh);
1717 if (buffer_new(bh)) {
1718 /* blockdev mappings never come here */
1719 clear_buffer_new(bh);
1720 unmap_underlying_metadata(bh->b_bdev,
1721 bh->b_blocknr);
1724 bh = bh->b_this_page;
1725 block++;
1726 } while (bh != head);
/*
 * Second pass: bh is back at head.  Note that "continue" in this do/while
 * jumps to the loop condition, which is what advances bh.
 */
1728 do {
1729 if (!buffer_mapped(bh))
1730 continue;
1732 * If it's a fully non-blocking write attempt and we cannot
1733 * lock the buffer then redirty the page. Note that this can
1734 * potentially cause a busy-wait loop from writeback threads
1735 * and kswapd activity, but those code paths have their own
1736 * higher-level throttling.
1738 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1739 lock_buffer(bh);
1740 } else if (!trylock_buffer(bh)) {
1741 redirty_page_for_writepage(wbc, page);
1742 continue;
/* Buffer is locked here: dirty ones are queued, clean ones unlocked again. */
1744 if (test_clear_buffer_dirty(bh)) {
1745 mark_buffer_async_write_endio(bh, handler);
1746 } else {
1747 unlock_buffer(bh);
1749 } while ((bh = bh->b_this_page) != head);
1752 * The page and its buffers are protected by PageWriteback(), so we can
1753 * drop the bh refcounts early.
1755 BUG_ON(PageWriteback(page));
1756 set_page_writeback(page);
1758 do {
/* Fetch the successor before submit_bh() is called on this buffer. */
1759 struct buffer_head *next = bh->b_this_page;
1760 if (buffer_async_write(bh)) {
1761 submit_bh(write_op, bh);
1762 nr_underway++;
1764 bh = next;
1765 } while (bh != head);
1766 unlock_page(page);
1768 err = 0;
1769 done:
1770 if (nr_underway == 0) {
1772 * The page was marked dirty, but the buffers were
1773 * clean. Someone wrote them back by hand with
1774 * ll_rw_block/submit_bh. A rare case.
1776 end_page_writeback(page);
1779 * The page and buffer_heads can be released at any time from
1780 * here on.
1783 return err;
1785 recover:
1787 * ENOSPC, or some other error. We may already have added some
1788 * blocks to the file, so we need to write these out to avoid
1789 * exposing stale data.
1790 * The page is currently locked and not marked for writeback
1792 bh = head;
1793 /* Recovery: lock and submit the mapped buffers */
1794 do {
1795 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1796 !buffer_delay(bh)) {
1797 lock_buffer(bh);
1798 mark_buffer_async_write_endio(bh, handler);
1799 } else {
1801 * The buffer may have been set dirty during
1802 * attachment to a dirty page.
1804 clear_buffer_dirty(bh);
1806 } while ((bh = bh->b_this_page) != head);
/* Record the failure on both the page and its mapping before submitting. */
1807 SetPageError(page);
1808 BUG_ON(PageWriteback(page));
1809 mapping_set_error(page->mapping, err);
1810 set_page_writeback(page);
1811 do {
1812 struct buffer_head *next = bh->b_this_page;
1813 if (buffer_async_write(bh)) {
1814 clear_buffer_dirty(bh);
1815 submit_bh(write_op, bh);
1816 nr_underway++;
1818 bh = next;
1819 } while (bh != head);
1820 unlock_page(page);
/* err still holds the get_block() failure; done: returns it. */
1821 goto done;
1825 * If a page has any new buffers, zero them out here, and mark them uptodate
1826 * and dirty so they'll be written out (in order to prevent uninitialised
1827 * block data from leaking). And clear the new bit.
1829 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1831 unsigned int block_start, block_end;
1832 struct buffer_head *head, *bh;
1834 BUG_ON(!PageLocked(page));
1835 if (!page_has_buffers(page))
1836 return;
1838 bh = head = page_buffers(page);
1839 block_start = 0;
1840 do {
1841 block_end = block_start + bh->b_size;
1843 if (buffer_new(bh)) {
1844 if (block_end > from && block_start < to) {
1845 if (!PageUptodate(page)) {
1846 unsigned start, size;
1848 start = max(from, block_start);
1849 size = min(to, block_end) - start;
1851 zero_user(page, start, size);
1852 set_buffer_uptodate(bh);
1855 clear_buffer_new(bh);
1856 mark_buffer_dirty(bh);
1860 block_start = block_end;
1861 bh = bh->b_this_page;
1862 } while (bh != head);
1864 EXPORT_SYMBOL(page_zero_new_buffers);
/*
 * __block_prepare_write - prepare the buffers of a locked page for a write
 * to the byte range [from, to).
 *
 * @inode:     inode the page belongs to (supplies i_blkbits)
 * @page:      locked page
 * @from, @to: byte range within the page, from <= to <= PAGE_CACHE_SIZE
 * @get_block: called with create=1 to map unmapped buffers in the range
 *
 * Buffers outside the range are only marked uptodate when the page already
 * is.  Buffers inside the range are mapped if necessary; newly allocated
 * ones have the bytes outside [from, to) zeroed (or are marked uptodate and
 * dirty when the page is uptodate); existing, partially covered buffers that
 * are not uptodate are read and waited for.
 *
 * Returns 0, the get_block() error, or -EIO if a read did not bring its
 * buffer uptodate.  On error page_zero_new_buffers() is called for the range.
 */
1866 static int __block_prepare_write(struct inode *inode, struct page *page,
1867 unsigned from, unsigned to, get_block_t *get_block)
1869 unsigned block_start, block_end;
1870 sector_t block;
1871 int err = 0;
1872 unsigned blocksize, bbits;
/*
 * wait[] collects buffers with a read in flight.
 * NOTE(review): two slots presumably suffice because only the first and
 * last block of the range can be partially covered - confirm.
 */
1873 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1875 BUG_ON(!PageLocked(page));
1876 BUG_ON(from > PAGE_CACHE_SIZE);
1877 BUG_ON(to > PAGE_CACHE_SIZE);
1878 BUG_ON(from > to);
1880 blocksize = 1 << inode->i_blkbits;
1881 if (!page_has_buffers(page))
1882 create_empty_buffers(page, blocksize, 0);
1883 head = page_buffers(page);
1885 bbits = inode->i_blkbits;
1886 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
/*
 * "bh != head || !block_start" lets the first iteration run (block_start
 * is 0 there) and stops once the ring has wrapped back to head.
 */
1888 for(bh = head, block_start = 0; bh != head || !block_start;
1889 block++, block_start=block_end, bh = bh->b_this_page) {
1890 block_end = block_start + blocksize;
/* Buffer lies entirely outside [from, to). */
1891 if (block_end <= from || block_start >= to) {
1892 if (PageUptodate(page)) {
1893 if (!buffer_uptodate(bh))
1894 set_buffer_uptodate(bh);
1896 continue;
1898 if (buffer_new(bh))
1899 clear_buffer_new(bh);
1900 if (!buffer_mapped(bh)) {
1901 WARN_ON(bh->b_size != blocksize);
1902 err = get_block(inode, block, bh, 1);
1903 if (err)
1904 break;
1905 if (buffer_new(bh)) {
1906 unmap_underlying_metadata(bh->b_bdev,
1907 bh->b_blocknr);
1908 if (PageUptodate(page)) {
1909 clear_buffer_new(bh);
1910 set_buffer_uptodate(bh);
1911 mark_buffer_dirty(bh);
1912 continue;
/* Zero the parts of the new block that this write will not cover. */
1914 if (block_end > to || block_start < from)
1915 zero_user_segments(page,
1916 to, block_end,
1917 block_start, from);
1918 continue;
1921 if (PageUptodate(page)) {
1922 if (!buffer_uptodate(bh))
1923 set_buffer_uptodate(bh);
1924 continue;
/* Partially overwritten, not uptodate: the existing contents must be read. */
1926 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1927 !buffer_unwritten(bh) &&
1928 (block_start < from || block_end > to)) {
1929 ll_rw_block(READ, 1, &bh);
1930 *wait_bh++=bh;
1934 * If we issued read requests - let them complete.
1936 while(wait_bh > wait) {
1937 wait_on_buffer(*--wait_bh);
1938 if (!buffer_uptodate(*wait_bh))
1939 err = -EIO;
1941 if (unlikely(err))
1942 page_zero_new_buffers(page, from, to);
1943 return err;
1946 static int __block_commit_write(struct inode *inode, struct page *page,
1947 unsigned from, unsigned to)
1949 unsigned block_start, block_end;
1950 int partial = 0;
1951 unsigned blocksize;
1952 struct buffer_head *bh, *head;
1954 blocksize = 1 << inode->i_blkbits;
1956 for(bh = head = page_buffers(page), block_start = 0;
1957 bh != head || !block_start;
1958 block_start=block_end, bh = bh->b_this_page) {
1959 block_end = block_start + blocksize;
1960 if (block_end <= from || block_start >= to) {
1961 if (!buffer_uptodate(bh))
1962 partial = 1;
1963 } else {
1964 set_buffer_uptodate(bh);
1965 mark_buffer_dirty(bh);
1967 clear_buffer_new(bh);
1971 * If this is a partial write which happened to make all buffers
1972 * uptodate then we can optimize away a bogus readpage() for
1973 * the next read(). Here we 'discover' whether the page went
1974 * uptodate as a result of this (potentially partial) write.
1976 if (!partial)
1977 SetPageUptodate(page);
1978 return 0;
1982 * block_write_begin takes care of the basic task of block allocation and
1983 * bringing partial write blocks uptodate first.
1985 * If *pagep is not NULL, then block_write_begin uses the locked page
1986 * at *pagep rather than allocating its own. In this case, the page will
1987 * not be unlocked or deallocated on failure.
1989 int block_write_begin(struct file *file, struct address_space *mapping,
1990 loff_t pos, unsigned len, unsigned flags,
1991 struct page **pagep, void **fsdata,
1992 get_block_t *get_block)
1994 struct inode *inode = mapping->host;
1995 int status = 0;
1996 struct page *page;
1997 pgoff_t index;
1998 unsigned start, end;
1999 int ownpage = 0;
2001 index = pos >> PAGE_CACHE_SHIFT;
2002 start = pos & (PAGE_CACHE_SIZE - 1);
2003 end = start + len;
2005 page = *pagep;
2006 if (page == NULL) {
2007 ownpage = 1;
2008 page = grab_cache_page_write_begin(mapping, index, flags);
2009 if (!page) {
2010 status = -ENOMEM;
2011 goto out;
2013 *pagep = page;
2014 } else
2015 BUG_ON(!PageLocked(page));
2017 status = __block_prepare_write(inode, page, start, end, get_block);
2018 if (unlikely(status)) {
2019 ClearPageUptodate(page);
2021 if (ownpage) {
2022 unlock_page(page);
2023 page_cache_release(page);
2024 *pagep = NULL;
2027 * prepare_write() may have instantiated a few blocks
2028 * outside i_size. Trim these off again. Don't need
2029 * i_size_read because we hold i_mutex.
2031 if (pos + len > inode->i_size)
2032 vmtruncate(inode, inode->i_size);
2036 out:
2037 return status;
2039 EXPORT_SYMBOL(block_write_begin);
2041 int block_write_end(struct file *file, struct address_space *mapping,
2042 loff_t pos, unsigned len, unsigned copied,
2043 struct page *page, void *fsdata)
2045 struct inode *inode = mapping->host;
2046 unsigned start;
2048 start = pos & (PAGE_CACHE_SIZE - 1);
2050 if (unlikely(copied < len)) {
2052 * The buffers that were written will now be uptodate, so we
2053 * don't have to worry about a readpage reading them and
2054 * overwriting a partial write. However if we have encountered
2055 * a short write and only partially written into a buffer, it
2056 * will not be marked uptodate, so a readpage might come in and
2057 * destroy our partial write.
2059 * Do the simplest thing, and just treat any short write to a
2060 * non uptodate page as a zero-length write, and force the
2061 * caller to redo the whole thing.
2063 if (!PageUptodate(page))
2064 copied = 0;
2066 page_zero_new_buffers(page, start+copied, start+len);
2068 flush_dcache_page(page);
2070 /* This could be a short (even 0-length) commit */
2071 __block_commit_write(inode, page, start, start+copied);
2073 return copied;
2075 EXPORT_SYMBOL(block_write_end);
2077 int generic_write_end(struct file *file, struct address_space *mapping,
2078 loff_t pos, unsigned len, unsigned copied,
2079 struct page *page, void *fsdata)
2081 struct inode *inode = mapping->host;
2082 int i_size_changed = 0;
2084 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2087 * No need to use i_size_read() here, the i_size
2088 * cannot change under us because we hold i_mutex.
2090 * But it's important to update i_size while still holding page lock:
2091 * page writeout could otherwise come in and zero beyond i_size.
2093 if (pos+copied > inode->i_size) {
2094 i_size_write(inode, pos+copied);
2095 i_size_changed = 1;
2098 unlock_page(page);
2099 page_cache_release(page);
2102 * Don't mark the inode dirty under page lock. First, it unnecessarily
2103 * makes the holding time of page lock longer. Second, it forces lock
2104 * ordering of page lock and transaction start for journaling
2105 * filesystems.
2107 if (i_size_changed)
2108 mark_inode_dirty(inode);
2110 return copied;
2112 EXPORT_SYMBOL(generic_write_end);
2115 * block_is_partially_uptodate checks whether buffers within a page are
2116 * uptodate or not.
2118 * Returns true if all buffers which correspond to a file portion
2119 * we want to read are uptodate.
2121 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2122 unsigned long from)
2124 struct inode *inode = page->mapping->host;
2125 unsigned block_start, block_end, blocksize;
2126 unsigned to;
2127 struct buffer_head *bh, *head;
2128 int ret = 1;
2130 if (!page_has_buffers(page))
2131 return 0;
2133 blocksize = 1 << inode->i_blkbits;
2134 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2135 to = from + to;
2136 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2137 return 0;
2139 head = page_buffers(page);
2140 bh = head;
2141 block_start = 0;
2142 do {
2143 block_end = block_start + blocksize;
2144 if (block_end > from && block_start < to) {
2145 if (!buffer_uptodate(bh)) {
2146 ret = 0;
2147 break;
2149 if (block_end >= to)
2150 break;
2152 block_start = block_end;
2153 bh = bh->b_this_page;
2154 } while (bh != head);
2156 return ret;
2158 EXPORT_SYMBOL(block_is_partially_uptodate);
2161 * Generic "read page" function for block devices that have the normal
2162 * get_block functionality. This is most of the block device filesystems.
2163 * Reads the page asynchronously --- the unlock_buffer() and
2164 * set/clear_buffer_uptodate() functions propagate buffer state into the
2165 * page struct once IO has completed.
/*
 * block_read_full_page - read a locked page through its buffers.
 *
 * @page:      locked page to fill
 * @get_block: called with create=0 to map unmapped buffers inside i_size
 *
 * Stage one collects in arr[] every buffer that still needs I/O; unmapped
 * buffers are zero-filled instead.  If nothing needs I/O the page is marked
 * uptodate (unless an error was flagged) and unlocked here.  Otherwise the
 * collected buffers are locked, marked for async read and submitted.
 *
 * Always returns 0; a get_block() failure is reported via SetPageError().
 */
2167 int block_read_full_page(struct page *page, get_block_t *get_block)
2169 struct inode *inode = page->mapping->host;
2170 sector_t iblock, lblock;
2171 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2172 unsigned int blocksize;
2173 int nr, i;
2174 int fully_mapped = 1;
2176 BUG_ON(!PageLocked(page));
2177 blocksize = 1 << inode->i_blkbits;
2178 if (!page_has_buffers(page))
2179 create_empty_buffers(page, blocksize, 0);
2180 head = page_buffers(page);
/* iblock: first file block on this page; lblock: i_size rounded up to blocks. */
2182 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2183 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2184 bh = head;
2185 nr = 0;
2186 i = 0;
/*
 * "continue" in this do/while jumps to the loop condition, so i, iblock
 * and bh still advance for skipped buffers.
 */
2188 do {
2189 if (buffer_uptodate(bh))
2190 continue;
2192 if (!buffer_mapped(bh)) {
2193 int err = 0;
2195 fully_mapped = 0;
2196 if (iblock < lblock) {
2197 WARN_ON(bh->b_size != blocksize);
2198 err = get_block(inode, iblock, bh, 0);
2199 if (err)
2200 SetPageError(page);
/* Still unmapped after get_block(): zero-fill this block instead of reading. */
2202 if (!buffer_mapped(bh)) {
2203 zero_user(page, i * blocksize, blocksize);
2204 if (!err)
2205 set_buffer_uptodate(bh);
2206 continue;
2209 * get_block() might have updated the buffer
2210 * synchronously
2212 if (buffer_uptodate(bh))
2213 continue;
2215 arr[nr++] = bh;
2216 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2218 if (fully_mapped)
2219 SetPageMappedToDisk(page);
2221 if (!nr) {
2223 * All buffers are uptodate - we can set the page uptodate
2224 * as well. But not if get_block() returned an error.
2226 if (!PageError(page))
2227 SetPageUptodate(page);
2228 unlock_page(page);
2229 return 0;
2232 /* Stage two: lock the buffers */
2233 for (i = 0; i < nr; i++) {
2234 bh = arr[i];
2235 lock_buffer(bh);
2236 mark_buffer_async_read(bh);
2240 * Stage 3: start the IO. Check for uptodateness
2241 * inside the buffer lock in case another process reading
2242 * the underlying blockdev brought it uptodate (the sct fix).
2244 for (i = 0; i < nr; i++) {
2245 bh = arr[i];
2246 if (buffer_uptodate(bh))
2247 end_buffer_async_read(bh, 1);
2248 else
2249 submit_bh(READ, bh);
2251 return 0;
2253 EXPORT_SYMBOL(block_read_full_page);
2255 /* utility function for filesystems that need to do work on expanding
2256 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2257 * deal with the hole.
2259 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2261 struct address_space *mapping = inode->i_mapping;
2262 struct page *page;
2263 void *fsdata;
2264 int err;
2266 err = inode_newsize_ok(inode, size);
2267 if (err)
2268 goto out;
2270 err = pagecache_write_begin(NULL, mapping, size, 0,
2271 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2272 &page, &fsdata);
2273 if (err)
2274 goto out;
2276 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2277 BUG_ON(err > 0);
2279 out:
2280 return err;
2282 EXPORT_SYMBOL(generic_cont_expand_simple);
2284 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2285 loff_t pos, loff_t *bytes)
2287 struct inode *inode = mapping->host;
2288 unsigned blocksize = 1 << inode->i_blkbits;
2289 struct page *page;
2290 void *fsdata;
2291 pgoff_t index, curidx;
2292 loff_t curpos;
2293 unsigned zerofrom, offset, len;
2294 int err = 0;
2296 index = pos >> PAGE_CACHE_SHIFT;
2297 offset = pos & ~PAGE_CACHE_MASK;
2299 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2300 zerofrom = curpos & ~PAGE_CACHE_MASK;
2301 if (zerofrom & (blocksize-1)) {
2302 *bytes |= (blocksize-1);
2303 (*bytes)++;
2305 len = PAGE_CACHE_SIZE - zerofrom;
2307 err = pagecache_write_begin(file, mapping, curpos, len,
2308 AOP_FLAG_UNINTERRUPTIBLE,
2309 &page, &fsdata);
2310 if (err)
2311 goto out;
2312 zero_user(page, zerofrom, len);
2313 err = pagecache_write_end(file, mapping, curpos, len, len,
2314 page, fsdata);
2315 if (err < 0)
2316 goto out;
2317 BUG_ON(err != len);
2318 err = 0;
2320 balance_dirty_pages_ratelimited(mapping);
2323 /* page covers the boundary, find the boundary offset */
2324 if (index == curidx) {
2325 zerofrom = curpos & ~PAGE_CACHE_MASK;
2326 /* if we will expand the thing last block will be filled */
2327 if (offset <= zerofrom) {
2328 goto out;
2330 if (zerofrom & (blocksize-1)) {
2331 *bytes |= (blocksize-1);
2332 (*bytes)++;
2334 len = offset - zerofrom;
2336 err = pagecache_write_begin(file, mapping, curpos, len,
2337 AOP_FLAG_UNINTERRUPTIBLE,
2338 &page, &fsdata);
2339 if (err)
2340 goto out;
2341 zero_user(page, zerofrom, len);
2342 err = pagecache_write_end(file, mapping, curpos, len, len,
2343 page, fsdata);
2344 if (err < 0)
2345 goto out;
2346 BUG_ON(err != len);
2347 err = 0;
2349 out:
2350 return err;
2354 * For moronic filesystems that do not allow holes in file.
2355 * We may have to extend the file.
2357 int cont_write_begin(struct file *file, struct address_space *mapping,
2358 loff_t pos, unsigned len, unsigned flags,
2359 struct page **pagep, void **fsdata,
2360 get_block_t *get_block, loff_t *bytes)
2362 struct inode *inode = mapping->host;
2363 unsigned blocksize = 1 << inode->i_blkbits;
2364 unsigned zerofrom;
2365 int err;
2367 err = cont_expand_zero(file, mapping, pos, bytes);
2368 if (err)
2369 goto out;
2371 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2372 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2373 *bytes |= (blocksize-1);
2374 (*bytes)++;
2377 *pagep = NULL;
2378 err = block_write_begin(file, mapping, pos, len,
2379 flags, pagep, fsdata, get_block);
2380 out:
2381 return err;
2383 EXPORT_SYMBOL(cont_write_begin);
2385 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2386 get_block_t *get_block)
2388 struct inode *inode = page->mapping->host;
2389 int err = __block_prepare_write(inode, page, from, to, get_block);
2390 if (err)
2391 ClearPageUptodate(page);
2392 return err;
2394 EXPORT_SYMBOL(block_prepare_write);
2396 int block_commit_write(struct page *page, unsigned from, unsigned to)
2398 struct inode *inode = page->mapping->host;
2399 __block_commit_write(inode,page,from,to);
2400 return 0;
2402 EXPORT_SYMBOL(block_commit_write);
2405 * block_page_mkwrite() is not allowed to change the file size as it gets
2406 * called from a page fault handler when a page is first dirtied. Hence we must
2407 * be careful to check for EOF conditions here. We set the page up correctly
2408 * for a written page which means we get ENOSPC checking when writing into
2409 * holes and correct delalloc and unwritten extent mapping on filesystems that
2410 * support these features.
2412 * We are not allowed to take the i_mutex here so we have to play games to
2413 * protect against truncate races as the page could now be beyond EOF. Because
2414 * vmtruncate() writes the inode size before removing pages, once we have the
2415 * page lock we can determine safely if the page is beyond EOF. If it is not
2416 * beyond EOF, then the page is guaranteed safe against truncation until we
2417 * unlock the page.
2420 block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2421 get_block_t get_block)
2423 struct page *page = vmf->page;
2424 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2425 unsigned long end;
2426 loff_t size;
2427 int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2429 lock_page(page);
2430 size = i_size_read(inode);
2431 if ((page->mapping != inode->i_mapping) ||
2432 (page_offset(page) > size)) {
2433 /* page got truncated out from underneath us */
2434 unlock_page(page);
2435 goto out;
2438 /* page is wholly or partially inside EOF */
2439 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2440 end = size & ~PAGE_CACHE_MASK;
2441 else
2442 end = PAGE_CACHE_SIZE;
2444 ret = block_prepare_write(page, 0, end, get_block);
2445 if (!ret)
2446 ret = block_commit_write(page, 0, end);
2448 if (unlikely(ret)) {
2449 unlock_page(page);
2450 if (ret == -ENOMEM)
2451 ret = VM_FAULT_OOM;
2452 else /* -ENOSPC, -EIO, etc */
2453 ret = VM_FAULT_SIGBUS;
2454 } else
2455 ret = VM_FAULT_LOCKED;
2457 out:
2458 return ret;
2460 EXPORT_SYMBOL(block_page_mkwrite);
/*
 * The prereads issued by nobh_write_begin() are special: their
 * buffer_heads are freed immediately, while still under the page lock.
 * They therefore need an end_io handler that does not touch the bh after
 * unlocking it; this one simply forwards to __end_buffer_read_notouch().
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}
2473 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2474 * the page (converting it to circular linked list and taking care of page
2475 * dirty races).
2477 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2479 struct buffer_head *bh;
2481 BUG_ON(!PageLocked(page));
2483 spin_lock(&page->mapping->private_lock);
2484 bh = head;
2485 do {
2486 if (PageDirty(page))
2487 set_buffer_dirty(bh);
2488 if (!bh->b_this_page)
2489 bh->b_this_page = head;
2490 bh = bh->b_this_page;
2491 } while (bh != head);
2492 attach_page_buffers(page, head);
2493 spin_unlock(&page->mapping->private_lock);
2497 * On entry, the page is fully not uptodate.
2498 * On exit the page is fully uptodate in the areas outside (from,to)
2500 int nobh_write_begin(struct file *file, struct address_space *mapping,
2501 loff_t pos, unsigned len, unsigned flags,
2502 struct page **pagep, void **fsdata,
2503 get_block_t *get_block)
2505 struct inode *inode = mapping->host;
2506 const unsigned blkbits = inode->i_blkbits;
2507 const unsigned blocksize = 1 << blkbits;
2508 struct buffer_head *head, *bh;
2509 struct page *page;
2510 pgoff_t index;
2511 unsigned from, to;
2512 unsigned block_in_page;
2513 unsigned block_start, block_end;
2514 sector_t block_in_file;
2515 int nr_reads = 0;
2516 int ret = 0;
2517 int is_mapped_to_disk = 1;
2519 index = pos >> PAGE_CACHE_SHIFT;
2520 from = pos & (PAGE_CACHE_SIZE - 1);
2521 to = from + len;
2523 page = grab_cache_page_write_begin(mapping, index, flags);
2524 if (!page)
2525 return -ENOMEM;
2526 *pagep = page;
2527 *fsdata = NULL;
2529 if (page_has_buffers(page)) {
2530 unlock_page(page);
2531 page_cache_release(page);
2532 *pagep = NULL;
2533 return block_write_begin(file, mapping, pos, len, flags, pagep,
2534 fsdata, get_block);
2537 if (PageMappedToDisk(page))
2538 return 0;
2541 * Allocate buffers so that we can keep track of state, and potentially
2542 * attach them to the page if an error occurs. In the common case of
2543 * no error, they will just be freed again without ever being attached
2544 * to the page (which is all OK, because we're under the page lock).
2546 * Be careful: the buffer linked list is a NULL terminated one, rather
2547 * than the circular one we're used to.
2549 head = alloc_page_buffers(page, blocksize, 0);
2550 if (!head) {
2551 ret = -ENOMEM;
2552 goto out_release;
2555 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2558 * We loop across all blocks in the page, whether or not they are
2559 * part of the affected region. This is so we can discover if the
2560 * page is fully mapped-to-disk.
2562 for (block_start = 0, block_in_page = 0, bh = head;
2563 block_start < PAGE_CACHE_SIZE;
2564 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2565 int create;
2567 block_end = block_start + blocksize;
2568 bh->b_state = 0;
2569 create = 1;
2570 if (block_start >= to)
2571 create = 0;
2572 ret = get_block(inode, block_in_file + block_in_page,
2573 bh, create);
2574 if (ret)
2575 goto failed;
2576 if (!buffer_mapped(bh))
2577 is_mapped_to_disk = 0;
2578 if (buffer_new(bh))
2579 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2580 if (PageUptodate(page)) {
2581 set_buffer_uptodate(bh);
2582 continue;
2584 if (buffer_new(bh) || !buffer_mapped(bh)) {
2585 zero_user_segments(page, block_start, from,
2586 to, block_end);
2587 continue;
2589 if (buffer_uptodate(bh))
2590 continue; /* reiserfs does this */
2591 if (block_start < from || block_end > to) {
2592 lock_buffer(bh);
2593 bh->b_end_io = end_buffer_read_nobh;
2594 submit_bh(READ, bh);
2595 nr_reads++;
2599 if (nr_reads) {
2601 * The page is locked, so these buffers are protected from
2602 * any VM or truncate activity. Hence we don't need to care
2603 * for the buffer_head refcounts.
2605 for (bh = head; bh; bh = bh->b_this_page) {
2606 wait_on_buffer(bh);
2607 if (!buffer_uptodate(bh))
2608 ret = -EIO;
2610 if (ret)
2611 goto failed;
2614 if (is_mapped_to_disk)
2615 SetPageMappedToDisk(page);
2617 *fsdata = head; /* to be released by nobh_write_end */
2619 return 0;
2621 failed:
2622 BUG_ON(!ret);
2624 * Error recovery is a bit difficult. We need to zero out blocks that
2625 * were newly allocated, and dirty them to ensure they get written out.
2626 * Buffers need to be attached to the page at this point, otherwise
2627 * the handling of potential IO errors during writeout would be hard
2628 * (could try doing synchronous writeout, but what if that fails too?)
2630 attach_nobh_buffers(page, head);
2631 page_zero_new_buffers(page, from, to);
2633 out_release:
2634 unlock_page(page);
2635 page_cache_release(page);
2636 *pagep = NULL;
2638 if (pos + len > inode->i_size)
2639 vmtruncate(inode, inode->i_size);
2641 return ret;
2643 EXPORT_SYMBOL(nobh_write_begin);
2645 int nobh_write_end(struct file *file, struct address_space *mapping,
2646 loff_t pos, unsigned len, unsigned copied,
2647 struct page *page, void *fsdata)
2649 struct inode *inode = page->mapping->host;
2650 struct buffer_head *head = fsdata;
2651 struct buffer_head *bh;
2652 BUG_ON(fsdata != NULL && page_has_buffers(page));
2654 if (unlikely(copied < len) && head)
2655 attach_nobh_buffers(page, head);
2656 if (page_has_buffers(page))
2657 return generic_write_end(file, mapping, pos, len,
2658 copied, page, fsdata);
2660 SetPageUptodate(page);
2661 set_page_dirty(page);
2662 if (pos+copied > inode->i_size) {
2663 i_size_write(inode, pos+copied);
2664 mark_inode_dirty(inode);
2667 unlock_page(page);
2668 page_cache_release(page);
2670 while (head) {
2671 bh = head;
2672 head = head->b_this_page;
2673 free_buffer_head(bh);
2676 return copied;
2678 EXPORT_SYMBOL(nobh_write_end);
2681 * nobh_writepage() - based on block_full_write_page() except
2682 * that it tries to operate without attaching bufferheads to
2683 * the page.
2685 int nobh_writepage(struct page *page, get_block_t *get_block,
2686 struct writeback_control *wbc)
2688 struct inode * const inode = page->mapping->host;
2689 loff_t i_size = i_size_read(inode);
2690 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2691 unsigned offset;
2692 int ret;
2694 /* Is the page fully inside i_size? */
2695 if (page->index < end_index)
2696 goto out;
2698 /* Is the page fully outside i_size? (truncate in progress) */
2699 offset = i_size & (PAGE_CACHE_SIZE-1);
2700 if (page->index >= end_index+1 || !offset) {
2702 * The page may have dirty, unmapped buffers. For example,
2703 * they may have been added in ext3_writepage(). Make them
2704 * freeable here, so the page does not leak.
2706 #if 0
2707 /* Not really sure about this - do we need this ? */
2708 if (page->mapping->a_ops->invalidatepage)
2709 page->mapping->a_ops->invalidatepage(page, offset);
2710 #endif
2711 unlock_page(page);
2712 return 0; /* don't care */
2716 * The page straddles i_size. It must be zeroed out on each and every
2717 * writepage invocation because it may be mmapped. "A file is mapped
2718 * in multiples of the page size. For a file that is not a multiple of
2719 * the page size, the remaining memory is zeroed when mapped, and
2720 * writes to that region are not written out to the file."
2722 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2723 out:
2724 ret = mpage_writepage(page, get_block, wbc);
2725 if (ret == -EAGAIN)
2726 ret = __block_write_full_page(inode, page, get_block, wbc,
2727 end_buffer_async_write);
2728 return ret;
2730 EXPORT_SYMBOL(nobh_writepage);
2732 int nobh_truncate_page(struct address_space *mapping,
2733 loff_t from, get_block_t *get_block)
2735 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2736 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2737 unsigned blocksize;
2738 sector_t iblock;
2739 unsigned length, pos;
2740 struct inode *inode = mapping->host;
2741 struct page *page;
2742 struct buffer_head map_bh;
2743 int err;
2745 blocksize = 1 << inode->i_blkbits;
2746 length = offset & (blocksize - 1);
2748 /* Block boundary? Nothing to do */
2749 if (!length)
2750 return 0;
2752 length = blocksize - length;
2753 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2755 page = grab_cache_page(mapping, index);
2756 err = -ENOMEM;
2757 if (!page)
2758 goto out;
2760 if (page_has_buffers(page)) {
2761 has_buffers:
2762 unlock_page(page);
2763 page_cache_release(page);
2764 return block_truncate_page(mapping, from, get_block);
2767 /* Find the buffer that contains "offset" */
2768 pos = blocksize;
2769 while (offset >= pos) {
2770 iblock++;
2771 pos += blocksize;
2774 map_bh.b_size = blocksize;
2775 map_bh.b_state = 0;
2776 err = get_block(inode, iblock, &map_bh, 0);
2777 if (err)
2778 goto unlock;
2779 /* unmapped? It's a hole - nothing to do */
2780 if (!buffer_mapped(&map_bh))
2781 goto unlock;
2783 /* Ok, it's mapped. Make sure it's up-to-date */
2784 if (!PageUptodate(page)) {
2785 err = mapping->a_ops->readpage(NULL, page);
2786 if (err) {
2787 page_cache_release(page);
2788 goto out;
2790 lock_page(page);
2791 if (!PageUptodate(page)) {
2792 err = -EIO;
2793 goto unlock;
2795 if (page_has_buffers(page))
2796 goto has_buffers;
2798 zero_user(page, offset, length);
2799 set_page_dirty(page);
2800 err = 0;
2802 unlock:
2803 unlock_page(page);
2804 page_cache_release(page);
2805 out:
2806 return err;
2808 EXPORT_SYMBOL(nobh_truncate_page);
2810 int block_truncate_page(struct address_space *mapping,
2811 loff_t from, get_block_t *get_block)
2813 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2814 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2815 unsigned blocksize;
2816 sector_t iblock;
2817 unsigned length, pos;
2818 struct inode *inode = mapping->host;
2819 struct page *page;
2820 struct buffer_head *bh;
2821 int err;
2823 blocksize = 1 << inode->i_blkbits;
2824 length = offset & (blocksize - 1);
2826 /* Block boundary? Nothing to do */
2827 if (!length)
2828 return 0;
2830 length = blocksize - length;
2831 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2833 page = grab_cache_page(mapping, index);
2834 err = -ENOMEM;
2835 if (!page)
2836 goto out;
2838 if (!page_has_buffers(page))
2839 create_empty_buffers(page, blocksize, 0);
2841 /* Find the buffer that contains "offset" */
2842 bh = page_buffers(page);
2843 pos = blocksize;
2844 while (offset >= pos) {
2845 bh = bh->b_this_page;
2846 iblock++;
2847 pos += blocksize;
2850 err = 0;
2851 if (!buffer_mapped(bh)) {
2852 WARN_ON(bh->b_size != blocksize);
2853 err = get_block(inode, iblock, bh, 0);
2854 if (err)
2855 goto unlock;
2856 /* unmapped? It's a hole - nothing to do */
2857 if (!buffer_mapped(bh))
2858 goto unlock;
2861 /* Ok, it's mapped. Make sure it's up-to-date */
2862 if (PageUptodate(page))
2863 set_buffer_uptodate(bh);
2865 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2866 err = -EIO;
2867 ll_rw_block(READ, 1, &bh);
2868 wait_on_buffer(bh);
2869 /* Uhhuh. Read error. Complain and punt. */
2870 if (!buffer_uptodate(bh))
2871 goto unlock;
2874 zero_user(page, offset, length);
2875 mark_buffer_dirty(bh);
2876 err = 0;
2878 unlock:
2879 unlock_page(page);
2880 page_cache_release(page);
2881 out:
2882 return err;
2884 EXPORT_SYMBOL(block_truncate_page);
2887 * The generic ->writepage function for buffer-backed address_spaces
2888 * this form passes in the end_io handler used to finish the IO.
2890 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2891 struct writeback_control *wbc, bh_end_io_t *handler)
2893 struct inode * const inode = page->mapping->host;
2894 loff_t i_size = i_size_read(inode);
2895 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2896 unsigned offset;
2898 /* Is the page fully inside i_size? */
2899 if (page->index < end_index)
2900 return __block_write_full_page(inode, page, get_block, wbc,
2901 handler);
2903 /* Is the page fully outside i_size? (truncate in progress) */
2904 offset = i_size & (PAGE_CACHE_SIZE-1);
2905 if (page->index >= end_index+1 || !offset) {
2907 * The page may have dirty, unmapped buffers. For example,
2908 * they may have been added in ext3_writepage(). Make them
2909 * freeable here, so the page does not leak.
2911 do_invalidatepage(page, 0);
2912 unlock_page(page);
2913 return 0; /* don't care */
2917 * The page straddles i_size. It must be zeroed out on each and every
2918 * writepage invokation because it may be mmapped. "A file is mapped
2919 * in multiples of the page size. For a file that is not a multiple of
2920 * the page size, the remaining memory is zeroed when mapped, and
2921 * writes to that region are not written out to the file."
2923 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2924 return __block_write_full_page(inode, page, get_block, wbc, handler);
2926 EXPORT_SYMBOL(block_write_full_page_endio);
2929 * The generic ->writepage function for buffer-backed address_spaces
2931 int block_write_full_page(struct page *page, get_block_t *get_block,
2932 struct writeback_control *wbc)
2934 return block_write_full_page_endio(page, get_block, wbc,
2935 end_buffer_async_write);
2937 EXPORT_SYMBOL(block_write_full_page);
2939 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2940 get_block_t *get_block)
2942 struct buffer_head tmp;
2943 struct inode *inode = mapping->host;
2944 tmp.b_state = 0;
2945 tmp.b_blocknr = 0;
2946 tmp.b_size = 1 << inode->i_blkbits;
2947 get_block(inode, block, &tmp, 0);
2948 return tmp.b_blocknr;
2950 EXPORT_SYMBOL(generic_block_bmap);
2952 static void end_bio_bh_io_sync(struct bio *bio, int err)
2954 struct buffer_head *bh = bio->bi_private;
2956 if (err == -EOPNOTSUPP) {
2957 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2958 set_bit(BH_Eopnotsupp, &bh->b_state);
2961 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2962 set_bit(BH_Quiet, &bh->b_state);
2964 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2965 bio_put(bio);
2968 int submit_bh(int rw, struct buffer_head * bh)
2970 struct bio *bio;
2971 int ret = 0;
2973 BUG_ON(!buffer_locked(bh));
2974 BUG_ON(!buffer_mapped(bh));
2975 BUG_ON(!bh->b_end_io);
2976 BUG_ON(buffer_delay(bh));
2977 BUG_ON(buffer_unwritten(bh));
2980 * Mask in barrier bit for a write (could be either a WRITE or a
2981 * WRITE_SYNC
2983 if (buffer_ordered(bh) && (rw & WRITE))
2984 rw |= WRITE_BARRIER;
2987 * Only clear out a write error when rewriting
2989 if (test_set_buffer_req(bh) && (rw & WRITE))
2990 clear_buffer_write_io_error(bh);
2993 * from here on down, it's all bio -- do the initial mapping,
2994 * submit_bio -> generic_make_request may further map this bio around
2996 bio = bio_alloc(GFP_NOIO, 1);
2998 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2999 bio->bi_bdev = bh->b_bdev;
3000 bio->bi_io_vec[0].bv_page = bh->b_page;
3001 bio->bi_io_vec[0].bv_len = bh->b_size;
3002 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
3004 bio->bi_vcnt = 1;
3005 bio->bi_idx = 0;
3006 bio->bi_size = bh->b_size;
3008 bio->bi_end_io = end_bio_bh_io_sync;
3009 bio->bi_private = bh;
3011 bio_get(bio);
3012 submit_bio(rw, bio);
3014 if (bio_flagged(bio, BIO_EOPNOTSUPP))
3015 ret = -EOPNOTSUPP;
3017 bio_put(bio);
3018 return ret;
3020 EXPORT_SYMBOL(submit_bh);
3023 * ll_rw_block: low-level access to block devices (DEPRECATED)
3024 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
3025 * @nr: number of &struct buffer_heads in the array
3026 * @bhs: array of pointers to &struct buffer_head
3028 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3029 * requests an I/O operation on them, either a %READ or a %WRITE. The third
3030 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
3031 * are sent to disk. The fourth %READA option is described in the documentation
3032 * for generic_make_request() which ll_rw_block() calls.
3034 * This function drops any buffer that it cannot get a lock on (with the
3035 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
3036 * clean when doing a write request, and any buffer that appears to be
3037 * up-to-date when doing read request. Further it marks as clean buffers that
3038 * are processed for writing (the buffer cache won't assume that they are
3039 * actually clean until the buffer gets unlocked).
3041 * ll_rw_block sets b_end_io to simple completion handler that marks
3042 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
3043 * any waiters.
3045 * All of the buffers must be for the same device, and must also be a
3046 * multiple of the current approved size for the device.
3048 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3050 int i;
3052 for (i = 0; i < nr; i++) {
3053 struct buffer_head *bh = bhs[i];
3055 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
3056 lock_buffer(bh);
3057 else if (!trylock_buffer(bh))
3058 continue;
3060 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3061 rw == SWRITE_SYNC_PLUG) {
3062 if (test_clear_buffer_dirty(bh)) {
3063 bh->b_end_io = end_buffer_write_sync;
3064 get_bh(bh);
3065 if (rw == SWRITE_SYNC)
3066 submit_bh(WRITE_SYNC, bh);
3067 else
3068 submit_bh(WRITE, bh);
3069 continue;
3071 } else {
3072 if (!buffer_uptodate(bh)) {
3073 bh->b_end_io = end_buffer_read_sync;
3074 get_bh(bh);
3075 submit_bh(rw, bh);
3076 continue;
3079 unlock_buffer(bh);
3082 EXPORT_SYMBOL(ll_rw_block);
3085 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3086 * and then start new I/O and then wait upon it. The caller must have a ref on
3087 * the buffer_head.
3089 int sync_dirty_buffer(struct buffer_head *bh)
3091 int ret = 0;
3093 WARN_ON(atomic_read(&bh->b_count) < 1);
3094 lock_buffer(bh);
3095 if (test_clear_buffer_dirty(bh)) {
3096 get_bh(bh);
3097 bh->b_end_io = end_buffer_write_sync;
3098 ret = submit_bh(WRITE_SYNC, bh);
3099 wait_on_buffer(bh);
3100 if (buffer_eopnotsupp(bh)) {
3101 clear_buffer_eopnotsupp(bh);
3102 ret = -EOPNOTSUPP;
3104 if (!ret && !buffer_uptodate(bh))
3105 ret = -EIO;
3106 } else {
3107 unlock_buffer(bh);
3109 return ret;
3111 EXPORT_SYMBOL(sync_dirty_buffer);
3114 * try_to_free_buffers() checks if all the buffers on this particular page
3115 * are unused, and releases them if so.
3117 * Exclusion against try_to_free_buffers may be obtained by either
3118 * locking the page or by holding its mapping's private_lock.
3120 * If the page is dirty but all the buffers are clean then we need to
3121 * be sure to mark the page clean as well. This is because the page
3122 * may be against a block device, and a later reattachment of buffers
3123 * to a dirty page will set *all* buffers dirty. Which would corrupt
3124 * filesystem data on the same device.
3126 * The same applies to regular filesystem pages: if all the buffers are
3127 * clean then we set the page clean and proceed. To do that, we require
3128 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3129 * private_lock.
3131 * try_to_free_buffers() is non-blocking.
3133 static inline int buffer_busy(struct buffer_head *bh)
3135 return atomic_read(&bh->b_count) |
3136 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3139 static int
3140 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3142 struct buffer_head *head = page_buffers(page);
3143 struct buffer_head *bh;
3145 bh = head;
3146 do {
3147 if (buffer_write_io_error(bh) && page->mapping)
3148 set_bit(AS_EIO, &page->mapping->flags);
3149 if (buffer_busy(bh))
3150 goto failed;
3151 bh = bh->b_this_page;
3152 } while (bh != head);
3154 do {
3155 struct buffer_head *next = bh->b_this_page;
3157 if (bh->b_assoc_map)
3158 __remove_assoc_queue(bh);
3159 bh = next;
3160 } while (bh != head);
3161 *buffers_to_free = head;
3162 __clear_page_buffers(page);
3163 return 1;
3164 failed:
3165 return 0;
3168 int try_to_free_buffers(struct page *page)
3170 struct address_space * const mapping = page->mapping;
3171 struct buffer_head *buffers_to_free = NULL;
3172 int ret = 0;
3174 BUG_ON(!PageLocked(page));
3175 if (PageWriteback(page))
3176 return 0;
3178 if (mapping == NULL) { /* can this still happen? */
3179 ret = drop_buffers(page, &buffers_to_free);
3180 goto out;
3183 spin_lock(&mapping->private_lock);
3184 ret = drop_buffers(page, &buffers_to_free);
3187 * If the filesystem writes its buffers by hand (eg ext3)
3188 * then we can have clean buffers against a dirty page. We
3189 * clean the page here; otherwise the VM will never notice
3190 * that the filesystem did any IO at all.
3192 * Also, during truncate, discard_buffer will have marked all
3193 * the page's buffers clean. We discover that here and clean
3194 * the page also.
3196 * private_lock must be held over this entire operation in order
3197 * to synchronise against __set_page_dirty_buffers and prevent the
3198 * dirty bit from being lost.
3200 if (ret)
3201 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3202 spin_unlock(&mapping->private_lock);
3203 out:
3204 if (buffers_to_free) {
3205 struct buffer_head *bh = buffers_to_free;
3207 do {
3208 struct buffer_head *next = bh->b_this_page;
3209 free_buffer_head(bh);
3210 bh = next;
3211 } while (bh != buffers_to_free);
3213 return ret;
3215 EXPORT_SYMBOL(try_to_free_buffers);
3217 void block_sync_page(struct page *page)
3219 struct address_space *mapping;
3221 smp_mb();
3222 mapping = page_mapping(page);
3223 if (mapping)
3224 blk_run_backing_dev(mapping->backing_dev_info, page);
3226 EXPORT_SYMBOL(block_sync_page);
3229 * There are no bdflush tunables left. But distributions are
3230 * still running obsolete flush daemons, so we terminate them here.
3232 * Use of bdflush() is deprecated and will be removed in a future kernel.
3233 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3235 SYSCALL_DEFINE2(bdflush, int, func, long, data)
3237 static int msg_count;
3239 if (!capable(CAP_SYS_ADMIN))
3240 return -EPERM;
3242 if (msg_count < 5) {
3243 msg_count++;
3244 printk(KERN_INFO
3245 "warning: process `%s' used the obsolete bdflush"
3246 " system call\n", current->comm);
3247 printk(KERN_INFO "Fix your initscripts?\n");
3250 if (func == 1)
3251 do_exit(0);
3252 return 0;
3256 * Buffer-head allocation
3258 static struct kmem_cache *bh_cachep;
3261 * Once the number of bh's in the machine exceeds this level, we start
3262 * stripping them in writeback.
3264 static int max_buffer_heads;
3266 int buffer_heads_over_limit;
3268 struct bh_accounting {
3269 int nr; /* Number of live bh's */
3270 int ratelimit; /* Limit cacheline bouncing */
3273 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3275 static void recalc_bh_state(void)
3277 int i;
3278 int tot = 0;
3280 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3281 return;
3282 __get_cpu_var(bh_accounting).ratelimit = 0;
3283 for_each_online_cpu(i)
3284 tot += per_cpu(bh_accounting, i).nr;
3285 buffer_heads_over_limit = (tot > max_buffer_heads);
3288 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3290 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3291 if (ret) {
3292 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3293 get_cpu_var(bh_accounting).nr++;
3294 recalc_bh_state();
3295 put_cpu_var(bh_accounting);
3297 return ret;
3299 EXPORT_SYMBOL(alloc_buffer_head);
3301 void free_buffer_head(struct buffer_head *bh)
3303 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3304 kmem_cache_free(bh_cachep, bh);
3305 get_cpu_var(bh_accounting).nr--;
3306 recalc_bh_state();
3307 put_cpu_var(bh_accounting);
3309 EXPORT_SYMBOL(free_buffer_head);
3311 static void buffer_exit_cpu(int cpu)
3313 int i;
3314 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3316 for (i = 0; i < BH_LRU_SIZE; i++) {
3317 brelse(b->bhs[i]);
3318 b->bhs[i] = NULL;
3320 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3321 per_cpu(bh_accounting, cpu).nr = 0;
3322 put_cpu_var(bh_accounting);
3325 static int buffer_cpu_notify(struct notifier_block *self,
3326 unsigned long action, void *hcpu)
3328 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3329 buffer_exit_cpu((unsigned long)hcpu);
3330 return NOTIFY_OK;
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	/* Check again now that the lock is held. */
	if (!buffer_uptodate(bh))
		return 0;

	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3353 * bh_submit_read - Submit a locked buffer for reading
3354 * @bh: struct buffer_head
3356 * Returns zero on success and -EIO on error.
3358 int bh_submit_read(struct buffer_head *bh)
3360 BUG_ON(!buffer_locked(bh));
3362 if (buffer_uptodate(bh)) {
3363 unlock_buffer(bh);
3364 return 0;
3367 get_bh(bh);
3368 bh->b_end_io = end_buffer_read_sync;
3369 submit_bh(READ, bh);
3370 wait_on_buffer(bh);
3371 if (buffer_uptodate(bh))
3372 return 0;
3373 return -EIO;
3375 EXPORT_SYMBOL(bh_submit_read);
3377 static void
3378 init_buffer_head(void *data)
3380 struct buffer_head *bh = data;
3382 memset(bh, 0, sizeof(*bh));
3383 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3386 void __init buffer_init(void)
3388 int nrpages;
3390 bh_cachep = kmem_cache_create("buffer_head",
3391 sizeof(struct buffer_head), 0,
3392 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3393 SLAB_MEM_SPREAD),
3394 init_buffer_head);
3397 * Limit the bh occupancy to 10% of ZONE_NORMAL
3399 nrpages = (nr_free_buffer_pages() * 10) / 100;
3400 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3401 hotcpu_notifier(buffer_cpu_notify, 0);